% SAT404/assignment_3/refs.bib
@misc{avetisyanSceneScriptReconstructingScenes2024,
title = {{{SceneScript}}: {{Reconstructing Scenes With An Autoregressive Structured Language Model}}},
shorttitle = {{{SceneScript}}},
author = {Avetisyan, Armen and Xie, Christopher and {Howard-Jenkins}, Henry and Yang, Tsun-Yi and Aroudj, Samir and Patra, Suvam and Zhang, Fuyang and Frost, Duncan and Holland, Luke and Orme, Campbell and Engel, Jakob and Miller, Edward and Newcombe, Richard and Balntas, Vasileios},
year = {2024},
month = mar,
number = {arXiv:2403.13064},
eprint = {2403.13064},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2403.13064},
urldate = {2025-03-21},
abstract = {We introduce SceneScript, a method that directly produces full scene models as a sequence of structured language commands using an autoregressive, token-based approach. Our proposed scene representation is inspired by recent successes in transformers \& LLMs, and departs from more traditional methods which commonly describe scenes as meshes, voxel grids, point clouds or radiance fields. Our method infers the set of structured language commands directly from encoded visual data using a scene language encoder-decoder architecture. To train SceneScript, we generate and release a large-scale synthetic dataset called Aria Synthetic Environments consisting of 100k high-quality indoor scenes, with photorealistic and ground-truth annotated renders of egocentric scene walkthroughs. Our method gives state-of-the-art results in architectural layout estimation, and competitive results in 3D object detection. Lastly, we explore an advantage for SceneScript, which is the ability to readily adapt to new commands via simple additions to the structured language, which we illustrate for tasks such as coarse 3D object part reconstruction.},
archiveprefix = {arXiv},
langid = {american},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C\:\\Users\\Dustella\\Zotero\\storage\\J4XWBZUJ\\Avetisyan et al. - 2024 - SceneScript Reconstructing Scenes With An Autoregressive Structured Language Model.pdf;C\:\\Users\\Dustella\\Zotero\\storage\\FRUKNIKI\\2403.html}
}
@misc{calvoTimePillarsTemporallyRecurrent3D2023,
title = {{{TimePillars}}: {{Temporally-Recurrent 3D LiDAR Object Detection}}},
shorttitle = {{{TimePillars}}},
author = {Calvo, Ernesto Lozano and Taveira, Bernardo and Kahl, Fredrik and Gustafsson, Niklas and Larsson, Jonathan and Tonderski, Adam},
year = {2023},
month = dec,
number = {arXiv:2312.17260},
eprint = {2312.17260},
primaryclass = {cs},
publisher = {arXiv},
urldate = {2024-10-24},
abstract = {Object detection applied to LiDAR point clouds is a relevant task in robotics, and particularly in autonomous driving. Single frame methods, predominant in the field, exploit information from individual sensor scans. Recent approaches achieve good performance, at relatively low inference time. Nevertheless, given the inherent high sparsity of LiDAR data, these methods struggle in long-range detection (e.g. 200m) which we deem to be critical in achieving safe automation. Aggregating multiple scans not only leads to a denser point cloud representation, but it also brings time-awareness to the system, and provides information about how the environment is changing. Solutions of this kind, however, are often highly problem-specific, demand careful data processing, and tend not to fulfil runtime requirements. In this context we propose TimePillars, a temporally-recurrent object detection pipeline which leverages the pillar representation of LiDAR data across time, respecting hardware integration efficiency constraints, and exploiting the diversity and long-range information of the novel Zenseact Open Dataset (ZOD). Through experimentation, we prove the benefits of having recurrency, and show how basic building blocks are enough to achieve robust and efficient results.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics},
file = {C:\Users\Dustella\Zotero\storage\H98SXJK8\Calvo et al. - 2023 - TimePillars Temporally-Recurrent 3D LiDAR Object Detection.pdf}
}
@misc{chenPointGPTAutoregressivelyGenerative2023,
title = {{{PointGPT}}: {{Auto-regressively Generative Pre-training}} from {{Point Clouds}}},
shorttitle = {{{PointGPT}}},
author = {Chen, Guangyan and Wang, Meiling and Yang, Yi and Yu, Kai and Yuan, Li and Yue, Yufeng},
year = {2023},
month = may,
number = {arXiv:2305.11487},
eprint = {2305.11487},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2305.11487},
urldate = {2025-02-28},
abstract = {Large language models (LLMs) based on the generative pre-training transformer (GPT) have demonstrated remarkable effectiveness across a diverse range of downstream tasks. Inspired by the advancements of the GPT, we present PointGPT, a novel approach that extends the concept of GPT to point clouds, addressing the challenges associated with disorder properties, low information density, and task gaps. Specifically, a point cloud auto-regressive generation task is proposed to pre-train transformer models. Our method partitions the input point cloud into multiple point patches and arranges them in an ordered sequence based on their spatial proximity. Then, an extractor-generator based transformer decoder, with a dual masking strategy, learns latent representations conditioned on the preceding point patches, aiming to predict the next one in an auto-regressive manner. Our scalable approach allows for learning high-capacity models that generalize well, achieving state-of-the-art performance on various downstream tasks. In particular, our approach achieves classification accuracies of 94.9\% on the ModelNet40 dataset and 93.4\% on the ScanObjectNN dataset, outperforming all other transformer models. Furthermore, our method also attains new state-of-the-art accuracies on all four few-shot learning benchmarks.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C\:\\Users\\Dustella\\Zotero\\storage\\PGKQM2FB\\Chen et al. - 2023 - PointGPT Auto-regressively Generative Pre-training from Point Clouds.pdf;C\:\\Users\\Dustella\\Zotero\\storage\\MA6UX2FJ\\2305.html}
}
@misc{daiScanNetRichlyannotated3D2017,
title = {{{ScanNet}}: {{Richly-annotated 3D Reconstructions}} of {{Indoor Scenes}}},
shorttitle = {{{ScanNet}}},
author = {Dai, Angela and Chang, Angel X. and Savva, Manolis and Halber, Maciej and Funkhouser, Thomas and Nie{\ss}ner, Matthias},
year = {2017},
month = apr,
number = {arXiv:1702.04405},
eprint = {1702.04405},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.1702.04405},
urldate = {2025-03-27},
abstract = {A key requirement for leveraging supervised deep learning methods is the availability of large, labeled datasets. Unfortunately, in the context of RGB-D scene understanding, very little data is available -- current datasets cover a small range of scene views and have limited semantic annotations. To address this issue, we introduce ScanNet, an RGB-D video dataset containing 2.5M views in 1513 scenes annotated with 3D camera poses, surface reconstructions, and semantic segmentations. To collect this data, we designed an easy-to-use and scalable RGB-D capture system that includes automated surface reconstruction and crowdsourced semantic annotation. We show that using this data helps achieve state-of-the-art performance on several 3D scene understanding tasks, including 3D object classification, semantic voxel labeling, and CAD model retrieval. The dataset is freely available at http://www.scan-net.org.},
archiveprefix = {arXiv},
langid = {american},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C\:\\Users\\Dustella\\Zotero\\storage\\D4A5TPDK\\Dai et al. - 2017 - ScanNet Richly-annotated 3D Reconstructions of Indoor Scenes.pdf;C\:\\Users\\Dustella\\Zotero\\storage\\4MLTF8R5\\1702.html}
}
@misc{dingLENetLightweightEfficient2023,
title = {{{LENet}}: {{Lightweight And Efficient LiDAR Semantic Segmentation Using Multi-Scale Convolution Attention}}},
shorttitle = {{{LENet}}},
author = {Ding, Ben},
year = {2023},
month = jun,
number = {arXiv:2301.04275},
eprint = {2301.04275},
publisher = {arXiv},
doi = {10.48550/arXiv.2301.04275},
urldate = {2024-11-25},
abstract = {LiDAR-based semantic segmentation is critical in the fields of robotics and autonomous driving as it provides a comprehensive understanding of the scene. This paper proposes a lightweight and efficient projection-based semantic segmentation network called LENet with an encoder-decoder structure for LiDAR-based semantic segmentation. The encoder is composed of a novel multi-scale convolutional attention (MSCA) module with varying receptive field sizes to capture features. The decoder employs an Interpolation And Convolution (IAC) mechanism utilizing bilinear interpolation for upsampling multi-resolution feature maps and integrating previous and current dimensional features through a single convolution layer. This approach significantly reduces the network's complexity while also improving its accuracy. Additionally, we introduce multiple auxiliary segmentation heads to further refine the network's accuracy. Extensive evaluations on publicly available datasets, including SemanticKITTI, SemanticPOSS, and nuScenes, show that our proposed method is lighter, more efficient, and robust compared to state-of-the-art semantic segmentation methods. Full implementation is available at https://github.com/fengluodb/LENet.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C\:\\Users\\Dustella\\Zotero\\storage\\AITRCJIK\\Ding - 2023 - LENet Lightweight And Efficient LiDAR Semantic Segmentation Using Multi-Scale Convolution Attention.pdf;C\:\\Users\\Dustella\\Zotero\\storage\\SS3LNQJC\\2301.html}
}
@misc{guoDeepLearning3D2020,
title = {Deep {{Learning}} for {{3D Point Clouds}}: {{A Survey}}},
shorttitle = {Deep {{Learning}} for {{3D Point Clouds}}},
author = {Guo, Yulan and Wang, Hanyun and Hu, Qingyong and Liu, Hao and Liu, Li and Bennamoun, Mohammed},
year = {2020},
month = jun,
number = {arXiv:1912.12033},
eprint = {1912.12033},
primaryclass = {cs},
publisher = {arXiv},
urldate = {2024-11-21},
abstract = {Point cloud learning has lately attracted increasing attention due to its wide applications in many areas, such as computer vision, autonomous driving, and robotics. As a dominating technique in AI, deep learning has been successfully used to solve various 2D vision problems. However, deep learning on point clouds is still in its infancy due to the unique challenges faced by the processing of point clouds with deep neural networks. Recently, deep learning on point clouds has become even thriving, with numerous methods being proposed to address different problems in this area. To stimulate future research, this paper presents a comprehensive review of recent progress in deep learning methods for point clouds. It covers three major tasks, including 3D shape classification, 3D object detection and tracking, and 3D point cloud segmentation. It also presents comparative results on several publicly available datasets, together with insightful observations and inspiring future research directions.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics,Electrical Engineering and Systems Science - Image and Video Processing},
file = {C:\Users\Dustella\Zotero\storage\CSB2ZYNP\Guo et al. - 2020 - Deep Learning for 3D Point Clouds A Survey.pdf}
}
@misc{huRandLANetEfficientSemantic2020,
title = {{{RandLA-Net}}: {{Efficient Semantic Segmentation}} of {{Large-Scale Point Clouds}}},
shorttitle = {{{RandLA-Net}}},
author = {Hu, Qingyong and Yang, Bo and Xie, Linhai and Rosa, Stefano and Guo, Yulan and Wang, Zhihua and Trigoni, Niki and Markham, Andrew},
year = {2020},
month = may,
number = {arXiv:1911.11236},
eprint = {1911.11236},
primaryclass = {cs},
publisher = {arXiv},
urldate = {2024-11-11},
abstract = {We study the problem of efficient semantic segmentation for large-scale 3D point clouds. By relying on expensive sampling techniques or computationally heavy pre/postprocessing steps, most existing approaches are only able to be trained and operate over small-scale point clouds. In this paper, we introduce RandLA-Net, an efficient and lightweight neural architecture to directly infer per-point semantics for large-scale point clouds. The key to our approach is to use random point sampling instead of more complex point selection approaches. Although remarkably computation and memory efficient, random sampling can discard key features by chance. To overcome this, we introduce a novel local feature aggregation module to progressively increase the receptive field for each 3D point, thereby effectively preserving geometric details. Extensive experiments show that our RandLA-Net can process 1 million points in a single pass with up to 200{\texttimes} faster than existing approaches. Moreover, our RandLA-Net clearly surpasses state-of-the-art approaches for semantic segmentation on two large-scale benchmarks Semantic3D and SemanticKITTI.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Electrical Engineering and Systems Science - Image and Video Processing},
file = {C:\Users\Dustella\Zotero\storage\ZSDB4DWZ\Hu et al. - 2020 - RandLA-Net Efficient Semantic Segmentation of Large-Scale Point Clouds.pdf}
}
@misc{laiSphericalTransformerLiDARbased2023,
title = {Spherical {{Transformer}} for {{LiDAR-based 3D Recognition}}},
author = {Lai, Xin and Chen, Yukang and Lu, Fanbin and Liu, Jianhui and Jia, Jiaya},
year = {2023},
month = mar,
number = {arXiv:2303.12766},
eprint = {2303.12766},
primaryclass = {cs},
publisher = {arXiv},
urldate = {2024-11-18},
abstract = {LiDAR-based 3D point cloud recognition has benefited various applications. Without specially considering the LiDAR point distribution, most current methods suffer from information disconnection and limited receptive field, especially for the sparse distant points. In this work, we study the varying-sparsity distribution of LiDAR points and present SphereFormer to directly aggregate information from dense close points to the sparse distant ones. We design radial window self-attention that partitions the space into multiple non-overlapping narrow and long windows. It overcomes the disconnection issue and enlarges the receptive field smoothly and dramatically, which significantly boosts the performance of sparse distant points. Moreover, to fit the narrow and long windows, we propose exponential splitting to yield fine-grained position encoding and dynamic feature selection to increase model representation ability. Notably, our method ranks 1st on both nuScenes and SemanticKITTI semantic segmentation benchmarks with 81.9\% and 74.8\% mIoU, respectively. Also, we achieve the 3rd place on nuScenes object detection benchmark with 72.8\% NDS and 68.5\% mAP. Code is available at https://github.com/dvlab-research/SphereFormer.git.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition},
file = {C:\Users\Dustella\Zotero\storage\NNCDXGL9\Lai et al. - 2023 - Spherical Transformer for LiDAR-based 3D Recognition.pdf}
}
@misc{liRAPiDSegRangeAwarePointwise2024,
title = {{{RAPiD-Seg}}: {{Range-Aware Pointwise Distance Distribution Networks}} for {{3D LiDAR Segmentation}}},
shorttitle = {{{RAPiD-Seg}}},
author = {Li, Li and Shum, Hubert P. H. and Breckon, Toby P.},
year = {2024},
month = sep,
number = {arXiv:2407.10159},
eprint = {2407.10159},
primaryclass = {cs},
publisher = {arXiv},
urldate = {2024-10-24},
abstract = {3D point clouds play a pivotal role in outdoor scene perception, especially in the context of autonomous driving. Recent advancements in 3D LiDAR segmentation often focus intensely on the spatial positioning and distribution of points for accurate segmentation. However, these methods, while robust in variable conditions, encounter challenges due to sole reliance on coordinates and point intensity, leading to poor isometric invariance and suboptimal segmentation. To tackle this challenge, our work introduces Range-Aware Pointwise Distance Distribution (RAPiD) features and the associated RAPiD-Seg architecture. Our RAPiD features exhibit rigid transformation invariance and effectively adapt to variations in point density, with a design focus on capturing the localized geometry of neighboring structures. They utilize inherent LiDAR isotropic radiation and semantic categorization for enhanced local representation and computational efficiency, while incorporating a 4D distance metric that integrates geometric and surface material reflectivity for improved semantic segmentation. To effectively embed high-dimensional RAPiD features, we propose a double-nested autoencoder structure with a novel class-aware embedding objective to encode high-dimensional features into manageable voxel-wise embeddings. Additionally, we propose RAPiD-Seg which incorporates a channel-wise attention fusion and two effective RAPiD-Seg variants, further optimizing the embedding for enhanced performance and generalization. Our method outperforms contemporary LiDAR segmentation work in terms of mIoU on SemanticKITTI (76.1) and nuScenes (83.6) datasets.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics},
file = {C:\Users\Dustella\Zotero\storage\NB5XSCWU\Li et al. - 2024 - RAPiD-Seg Range-Aware Pointwise Distance Distribution Networks for 3D LiDAR Segmentation.pdf}
}
@misc{PapersCodePointNet,
title = {Papers with {{Code}} - {{PointNet}}: {{Deep Learning}} on {{Point Sets}} for {{3D Classification}} and {{Segmentation}}},
shorttitle = {Papers with {{Code}} - {{PointNet}}},
urldate = {2024-10-25},
howpublished = {https://paperswithcode.com/paper/pointnet-deep-learning-on-point-sets-for-3d},
langid = {english},
file = {C:\Users\Dustella\Zotero\storage\GTJ8KDKL\pointnet-deep-learning-on-point-sets-for-3d.html}
}
@misc{parkRethinkingDataAugmentation2024,
title = {Rethinking {{Data Augmentation}} for {{Robust LiDAR Semantic Segmentation}} in {{Adverse Weather}}},
author = {Park, Junsung and Kim, Kyungmin and Shim, Hyunjung},
year = {2024},
month = jul,
number = {arXiv:2407.02286},
eprint = {2407.02286},
primaryclass = {cs},
publisher = {arXiv},
urldate = {2024-10-24},
abstract = {Existing LiDAR semantic segmentation methods often struggle with performance declines in adverse weather conditions. Previous research has addressed this issue by simulating adverse weather or employing universal data augmentation during training. However, these methods lack a detailed analysis and understanding of how adverse weather negatively affects LiDAR semantic segmentation performance. Motivated by this issue, we identified key factors of adverse weather and conducted a toy experiment to pinpoint the main causes of performance degradation: (1) Geometric perturbation due to refraction caused by fog or droplets in the air and (2) Point drop due to energy absorption and occlusions. Based on these findings, we propose new strategic data augmentation techniques. First, we introduced a Selective Jittering (SJ) that jitters points in the random range of depth (or angle) to mimic geometric perturbation. Additionally, we developed a Learnable Point Drop (LPD) to learn vulnerable erase patterns with Deep Q-Learning Network to approximate the point drop phenomenon from adverse weather conditions. Without precise weather simulation, these techniques strengthen the LiDAR semantic segmentation model by exposing it to vulnerable conditions identified by our data-centric analysis. Experimental results confirmed the suitability of the proposed data augmentation methods for enhancing robustness against adverse weather conditions. Our method attains a remarkable 39.5 mIoU on the SemanticKITTI-to-SemanticSTF benchmark, surpassing the previous state-of-the-art by over 5.4\%p, tripling the improvement over the baseline compared to previous methods achieved.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition},
file = {C:\Users\Dustella\Zotero\storage\GLAUMJCL\Park et al. - 2024 - Rethinking Data Augmentation for Robust LiDAR Semantic Segmentation in Adverse Weather.pdf}
}
@misc{qiPointNetDeepLearning2017,
title = {{{PointNet}}: {{Deep Learning}} on {{Point Sets}} for {{3D Classification}} and {{Segmentation}}},
shorttitle = {{{PointNet}}},
author = {Qi, Charles R. and Su, Hao and Mo, Kaichun and Guibas, Leonidas J.},
year = {2017},
month = apr,
number = {arXiv:1612.00593},
eprint = {1612.00593},
primaryclass = {cs},
publisher = {arXiv},
urldate = {2024-10-25},
abstract = {Point cloud is an important type of geometric data structure. Due to its irregular format, most researchers transform such data to regular 3D voxel grids or collections of images. This, however, renders data unnecessarily voluminous and causes issues. In this paper, we design a novel type of neural network that directly consumes point clouds, which well respects the permutation invariance of points in the input. Our network, named PointNet, provides a unified architecture for applications ranging from object classification, part segmentation, to scene semantic parsing. Though simple, PointNet is highly efficient and effective. Empirically, it shows strong performance on par or even better than state of the art. Theoretically, we provide analysis towards understanding of what the network has learnt and why the network is robust with respect to input perturbation and corruption.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C:\Users\Dustella\Zotero\storage\SEMCPFVH\Qi et al. - 2017 - PointNet Deep Learning on Point Sets for 3D Classification and Segmentation.pdf}
}
@misc{schmidtLiDARViewSynthesis2023,
title = {{{LiDAR View Synthesis}} for {{Robust Vehicle Navigation Without Expert Labels}}},
author = {Schmidt, Jonathan and Khan, Qadeer and Cremers, Daniel},
year = {2023},
month = aug,
number = {arXiv:2308.01424},
eprint = {2308.01424},
primaryclass = {cs},
publisher = {arXiv},
urldate = {2024-10-24},
abstract = {Deep learning models for self-driving cars require a diverse training dataset to manage critical driving scenarios on public roads safely. This includes having data from divergent trajectories, such as the oncoming traffic lane or sidewalks. Such data would be too dangerous to collect in the real world. Data augmentation approaches have been proposed to tackle this issue using RGB images. However, solutions based on LiDAR sensors are scarce. Therefore, we propose synthesizing additional LiDAR point clouds from novel viewpoints without physically driving at dangerous positions. The LiDAR view synthesis is done using mesh reconstruction and ray casting. We train a deep learning model, which takes a LiDAR scan as input and predicts the future trajectory as output. A waypoint controller is then applied to this predicted trajectory to determine the throttle and steering labels of the ego-vehicle. Our method neither requires expert driving labels for the original nor the synthesized LiDAR sequence. Instead, we infer labels from LiDAR odometry. We demonstrate the effectiveness of our approach in a comprehensive online evaluation and with a comparison to concurrent work. Our results show the importance of synthesizing additional LiDAR point clouds, particularly in terms of model robustness. Project page: https://jonathsch.github.io/lidar-synthesis/},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C:\Users\Dustella\Zotero\storage\YTVNTFNG\Schmidt et al. - 2023 - LiDAR View Synthesis for Robust Vehicle Navigation Without Expert Labels.pdf}
}
@misc{thomasKPConvFlexibleDeformable2019,
title = {{{KPConv}}: {{Flexible}} and {{Deformable Convolution}} for {{Point Clouds}}},
shorttitle = {{{KPConv}}},
author = {Thomas, Hugues and Qi, Charles R. and Deschaud, Jean-Emmanuel and Marcotegui, Beatriz and Goulette, Fran{\c c}ois and Guibas, Leonidas J.},
year = {2019},
month = aug,
number = {arXiv:1904.08889},
eprint = {1904.08889},
publisher = {arXiv},
doi = {10.48550/arXiv.1904.08889},
urldate = {2024-11-11},
abstract = {We present Kernel Point Convolution (KPConv), a new design of point convolution, i.e. that operates on point clouds without any intermediate representation. The convolution weights of KPConv are located in Euclidean space by kernel points, and applied to the input points close to them. Its capacity to use any number of kernel points gives KPConv more flexibility than fixed grid convolutions. Furthermore, these locations are continuous in space and can be learned by the network. Therefore, KPConv can be extended to deformable convolutions that learn to adapt kernel points to local geometry. Thanks to a regular subsampling strategy, KPConv is also efficient and robust to varying densities. Whether they use deformable KPConv for complex tasks, or rigid KPConv for simpler tasks, our networks outperform state-of-the-art classification and segmentation approaches on several datasets. We also offer ablation studies and visualizations to provide understanding of what has been learned by KPConv and to validate the descriptive power of deformable KPConv.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C\:\\Users\\Dustella\\Zotero\\storage\\GFTWYHLA\\Thomas et al. - 2019 - KPConv Flexible and Deformable Convolution for Point Clouds.pdf;C\:\\Users\\Dustella\\Zotero\\storage\\99L2NSWF\\1904.html}
}
@misc{wangSegNet4DEffectiveEfficient2024,
title = {{{SegNet4D}}: {{Effective}} and {{Efficient 4D LiDAR Semantic Segmentation}} in {{Autonomous Driving Environments}}},
shorttitle = {{{SegNet4D}}},
author = {Wang, Neng and Guo, Ruibin and Shi, Chenghao and Zhang, Hui and Lu, Huimin and Zheng, Zhiqiang and Chen, Xieyuanli},
year = {2024},
month = jun,
number = {arXiv:2406.16279},
eprint = {2406.16279},
primaryclass = {cs},
publisher = {arXiv},
urldate = {2024-10-24},
abstract = {4D LiDAR semantic segmentation, also referred to as multi-scan semantic segmentation, plays a crucial role in enhancing the environmental understanding capabilities of autonomous vehicles. It entails identifying the semantic category of each point in the LiDAR scan and distinguishing whether it is dynamic, a critical aspect in downstream tasks such as path planning and autonomous navigation. Existing methods for 4D semantic segmentation often rely on computationally intensive 4D convolutions for multi-scan input, resulting in poor real-time performance. In this article, we introduce SegNet4D, a novel real-time multi-scan semantic segmentation method leveraging a projection-based approach for fast motion feature encoding, showcasing outstanding performance. SegNet4D treats 4D semantic segmentation as two distinct tasks: single-scan semantic segmentation and moving object segmentation, each addressed by a dedicated head. These results are then fused in the proposed motion-semantic fusion module to achieve comprehensive multi-scan semantic segmentation. Besides, we propose extracting instance information from the current scan and incorporating it into the network for instance-aware segmentation. Our approach exhibits state-of-the-art performance across multiple datasets and stands out as a real-time multi-scan semantic segmentation method. The implementation of SegNet4D will be made available at https://github.com/nubot-nudt/SegNet4D.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C:\Users\Dustella\Zotero\storage\W87LWRFV\Wang et al. - 2024 - SegNet4D Effective and Efficient 4D LiDAR Semantic Segmentation in Autonomous Driving Environments.pdf}
}
@misc{wuPointTransformerV22022,
title = {Point {{Transformer V2}}: {{Grouped Vector Attention}} and {{Partition-based Pooling}}},
shorttitle = {Point {{Transformer V2}}},
author = {Wu, Xiaoyang and Lao, Yixing and Jiang, Li and Liu, Xihui and Zhao, Hengshuang},
year = {2022},
month = oct,
number = {arXiv:2210.05666},
eprint = {2210.05666},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2210.05666},
urldate = {2025-03-04},
abstract = {As a pioneering work exploring transformer architecture for 3D point cloud understanding, Point Transformer achieves impressive results on multiple highly competitive benchmarks. In this work, we analyze the limitations of the Point Transformer and propose our powerful and efficient Point Transformer V2 model with novel designs that overcome the limitations of previous work. In particular, we first propose group vector attention, which is more effective than the previous version of vector attention. Inheriting the advantages of both learnable weight encoding and multi-head attention, we present a highly effective implementation of grouped vector attention with a novel grouped weight encoding layer. We also strengthen the position information for attention by an additional position encoding multiplier. Furthermore, we design novel and lightweight partition-based pooling methods which enable better spatial alignment and more efficient sampling. Extensive experiments show that our model achieves better performance than its predecessor and achieves state-of-the-art on several challenging 3D point cloud understanding benchmarks, including 3D point cloud segmentation on ScanNet v2 and S3DIS and 3D point cloud classification on ModelNet40. Our code will be available at https://github.com/Gofinge/PointTransformerV2.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C:\Users\Dustella\Zotero\storage\4MVHL7TR\Wu et al. - 2022 - Point Transformer V2 Grouped Vector Attention and Partition-based Pooling.pdf}
}
@misc{wuPointTransformerV32024,
title = {Point {{Transformer V3}}: {{Simpler}}, {{Faster}}, {{Stronger}}},
shorttitle = {Point {{Transformer V3}}},
author = {Wu, Xiaoyang and Jiang, Li and Wang, Peng-Shuai and Liu, Zhijian and Liu, Xihui and Qiao, Yu and Ouyang, Wanli and He, Tong and Zhao, Hengshuang},
year = {2024},
month = mar,
number = {arXiv:2312.10035},
eprint = {2312.10035},
publisher = {arXiv},
doi = {10.48550/arXiv.2312.10035},
urldate = {2024-11-25},
abstract = {This paper is not motivated to seek innovation within the attention mechanism. Instead, it focuses on overcoming the existing trade-offs between accuracy and efficiency within the context of point cloud processing, leveraging the power of scale. Drawing inspiration from recent advances in 3D large-scale representation learning, we recognize that model performance is more influenced by scale than by intricate design. Therefore, we present Point Transformer V3 (PTv3), which prioritizes simplicity and efficiency over the accuracy of certain mechanisms that are minor to the overall performance after scaling, such as replacing the precise neighbor search by KNN with an efficient serialized neighbor mapping of point clouds organized with specific patterns. This principle enables significant scaling, expanding the receptive field from 16 to 1024 points while remaining efficient (a 3x increase in processing speed and a 10x improvement in memory efficiency compared with its predecessor, PTv2). PTv3 attains state-of-the-art results on over 20 downstream tasks that span both indoor and outdoor scenarios. Further enhanced with multi-dataset joint training, PTv3 pushes these results to a higher level.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C\:\\Users\\Dustella\\Zotero\\storage\\XYSQFHA7\\Wu et al. - 2024 - Point Transformer V3 Simpler, Faster, Stronger.pdf;C\:\\Users\\Dustella\\Zotero\\storage\\FPH7SMHI\\2312.html}
}
@misc{xieAnnotatorGenericActive2023,
title = {Annotator: {{A Generic Active Learning Baseline}} for {{LiDAR Semantic Segmentation}}},
shorttitle = {Annotator},
author = {Xie, Binhui and Li, Shuang and Guo, Qingju and Liu, Chi Harold and Cheng, Xinjing},
year = {2023},
month = oct,
number = {arXiv:2310.20293},
eprint = {2310.20293},
primaryclass = {cs},
publisher = {arXiv},
urldate = {2024-10-24},
abstract = {Active learning, a label-efficient paradigm, empowers models to interactively query an oracle for labeling new data. In the realm of LiDAR semantic segmentation, the challenges stem from the sheer volume of point clouds, rendering annotation labor-intensive and cost-prohibitive. This paper presents Annotator, a general and efficient active learning baseline, in which a voxel-centric online selection strategy is tailored to efficiently probe and annotate the salient and exemplar voxel grids within each LiDAR scan, even under distribution shift. Concretely, we first execute an in-depth analysis of several common selection strategies such as Random, Entropy, Margin, and then develop voxel confusion degree (VCD) to exploit the local topology relations and structures of point clouds. Annotator excels in diverse settings, with a particular focus on active learning (AL), active source-free domain adaptation (ASFDA), and active domain adaptation (ADA). It consistently delivers exceptional performance across LiDAR semantic segmentation benchmarks, spanning both simulation-to-real and real-to-real scenarios. Surprisingly, Annotator exhibits remarkable efficiency, requiring significantly fewer annotations, e.g., just labeling five voxels per scan in the SynLiDAR {$\rightarrow$} SemanticKITTI task. This results in impressive performance, achieving 87.8\% fully-supervised performance under AL, 88.5\% under ASFDA, and 94.4\% under ADA. We envision that Annotator will offer a simple, general, and efficient solution for label-efficient 3D applications.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C:\Users\Dustella\Zotero\storage\R93MHYZQ\Xie et al. - 2023 - Annotator A Generic Active Learning Baseline for LiDAR Semantic Segmentation.pdf}
}
@article{zengSelfsupervisedLearningPoint2024,
title = {Self-Supervised Learning for Point Cloud Data: {{A}} Survey},
shorttitle = {Self-Supervised Learning for Point Cloud Data},
author = {Zeng, Changyu and Wang, Wei and Nguyen, Anh and Xiao, Jimin and Yue, Yutao},
year = {2024},
month = mar,
journal = {Expert Systems with Applications},
volume = {237},
pages = {121354},
issn = {0957-4174},
doi = {10.1016/j.eswa.2023.121354},
urldate = {2024-11-25},
abstract = {3D point clouds are a crucial type of data collected by LiDAR sensors and widely used in transportation applications due to its concise descriptions and accurate localization. Deep neural networks (DNNs) have achieved remarkable success in processing large amount of disordered and sparse 3D point clouds, especially in various computer vision tasks, such as pedestrian detection and vehicle recognition. Among all the learning paradigms, Self-Supervised Learning (SSL), an unsupervised training paradigm that mines effective information from the data itself, is considered as an essential solution to solve the time-consuming and labor-intensive data labeling problems via smart pre-training task design. This paper provides a comprehensive survey of recent advances on SSL for point clouds. We first present an innovative taxonomy, categorizing the existing SSL methods into four broad categories based on the pretexts' characteristics. Under each category, we then further categorize the methods into more fine-grained groups and summarize the strength and limitations of the representative methods. We also compare the performance of the notable SSL methods in literature on multiple downstream tasks on benchmark datasets both quantitatively and qualitatively. Finally, we propose a number of future research directions based on the identified limitations of existing SSL research on point clouds.},
keywords = {Computer vision,Point clouds,Pretext task,Representation learning,Self-supervised learning,Transfer learning},
file = {C\:\\Users\\Dustella\\Zotero\\storage\\UEQK3BPY\\Zeng et al. - 2024 - Self-supervised learning for point cloud data A survey.pdf;C\:\\Users\\Dustella\\Zotero\\storage\\RXGEIFDA\\S0957417423018560.html}
}
@misc{zhangApproachingOutsideScaling2024,
title = {Approaching {{Outside}}: {{Scaling Unsupervised 3D Object Detection}} from {{2D Scene}}},
shorttitle = {Approaching {{Outside}}},
author = {Zhang, Ruiyang and Zhang, Hu and Yu, Hang and Zheng, Zhedong},
year = {2024},
month = jul,
number = {arXiv:2407.08569},
eprint = {2407.08569},
primaryclass = {cs},
publisher = {arXiv},
urldate = {2024-10-24},
abstract = {The unsupervised 3D object detection is to accurately detect objects in unstructured environments with no explicit supervisory signals. This task, given sparse LiDAR point clouds, often results in compromised performance for detecting distant or small objects due to the inherent sparsity and limited spatial resolution. In this paper, we are among the early attempts to integrate LiDAR data with 2D images for unsupervised 3D detection and introduce a new method, dubbed LiDAR2D Self-paced Learning (LiSe). We argue that RGB images serve as a valuable complement to LiDAR data, offering precise 2D localization cues, particularly when scarce LiDAR points are available for certain objects. Considering the unique characteristics of both modalities, our framework devises a self-paced learning pipeline that incorporates adaptive sampling and weak model aggregation strategies. The adaptive sampling strategy dynamically tunes the distribution of pseudo labels during training, countering the tendency of models to overfit easily detected samples, such as nearby and large-sized objects. By doing so, it ensures a balanced learning trajectory across varying object scales and distances. The weak model aggregation component consolidates the strengths of models trained under different pseudo label distributions, culminating in a robust and powerful final model. Experimental evaluations validate the efficacy of our proposed LiSe method, manifesting significant improvements of +7.1\% APBEV and +3.4\% AP3D on nuScenes, and +8.3\% APBEV and +7.4\% AP3D on Lyft compared to existing techniques.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C:\Users\Dustella\Zotero\storage\V4KC9KL6\Zhang et al. - 2024 - Approaching Outside Scaling Unsupervised 3D Object Detection from 2D Scene.pdf}
}
@article{zhangDeepLearningbased3D2023,
title = {Deep Learning-Based {{3D}} Point Cloud Classification: {{A}} Systematic Survey and Outlook},
shorttitle = {Deep Learning-Based {{3D}} Point Cloud Classification},
author = {Zhang, Huang and Wang, Changshuo and Tian, Shengwei and Lu, Baoli and Zhang, Liping and Ning, Xin and Bai, Xiao},
year = {2023},
month = sep,
journal = {Displays},
volume = {79},
pages = {102456},
issn = {0141-9382},
doi = {10.1016/j.displa.2023.102456},
urldate = {2024-11-25},
abstract = {In recent years, point cloud representation has become one of the research hotspots in the field of computer vision, and has been widely used in many fields, such as autonomous driving, virtual reality, robotics, etc. Although deep learning techniques have achieved great success in processing regular structured 2D grid image data, there are still great challenges in processing irregular, unstructured point cloud data. Point cloud classification is the basis of point cloud analysis, and many deep learning-based methods have been widely used in this task. Therefore, the purpose of this paper is to provide researchers in this field with the latest research progress and future trends. First, we introduce point cloud acquisition, characteristics, and challenges. Second, we review 3D data representations, storage formats, and commonly used datasets for point cloud classification. We then summarize deep learning-based methods for point cloud classification and complement recent research work. Next, we compare and analyze the performance of the main methods. Finally, we discuss some challenges and future directions for point cloud classification.},
langid = {american},
keywords = {3D data,Classification,Deep learning,Point cloud},
file = {C\:\\Users\\Dustella\\Zotero\\storage\\FCMEI7NV\\Zhang et al. - 2023 - Deep learning-based 3D point cloud classification A systematic survey and outlook.pdf;C\:\\Users\\Dustella\\Zotero\\storage\\9427LLCF\\S0141938223000896.html}
}
@misc{zhangDetectingAnomaliesLiDAR2023,
title = {Detecting the {{Anomalies}} in {{LiDAR Pointcloud}}},
author = {Zhang, Chiyu and Han, Ji and Zou, Yao and Dong, Kexin and Li, Yujia and Ding, Junchun and Han, Xiaoling},
year = {2023},
month = jul,
number = {arXiv:2308.00187},
eprint = {2308.00187},
primaryclass = {cs},
publisher = {arXiv},
urldate = {2024-10-24},
abstract = {LiDAR sensors play an important role in the perception stack of modern autonomous driving systems. Adverse weather conditions such as rain, fog and dust, as well as some (occasional) LiDAR hardware fault may cause the LiDAR to produce pointcloud with abnormal patterns such as scattered noise points and uncommon intensity values. In this paper, we propose a novel approach to detect whether a LiDAR is generating anomalous pointcloud by analyzing the pointcloud characteristics. Specifically, we develop a pointcloud quality metric based on the LiDAR points' spatial and intensity distribution to characterize the noise level of the pointcloud, which relies on pure mathematical analysis and does not require any labeling or training as learning-based methods do. Therefore, the method is scalable and can be quickly deployed either online to improve the autonomy safety by monitoring anomalies in the LiDAR data or offline to perform in-depth study of the LiDAR behavior over large amount of data. The proposed approach is studied with extensive real public road data collected by LiDARs with different scanning mechanisms and laser spectrums, and is proven to be able to effectively handle various known and unknown sources of pointcloud anomaly.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Computer Science - Robotics,Electrical Engineering and Systems Science - Signal Processing},
file = {C:\Users\Dustella\Zotero\storage\9VXKS8XI\Zhang et al. - 2023 - Detecting the Anomalies in LiDAR Pointcloud.pdf}
}
@misc{zhangGSMatchingReconsideringFeature2024,
title = {{{GS-Matching}}: {{Reconsidering Feature Matching}} Task in {{Point Cloud Registration}}},
shorttitle = {{{GS-Matching}}},
author = {Zhang, Yaojie and Huang, Tianlun and Wang, Weijun and Feng, Wei},
year = {2024},
month = dec,
number = {arXiv:2412.04855},
eprint = {2412.04855},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2412.04855},
urldate = {2024-12-18},
abstract = {Traditional point cloud registration (PCR) methods for feature matching often employ the nearest neighbor policy. This leads to many-to-one matches and numerous potential inliers without any corresponding point. Recently, some approaches have framed the feature matching task as an assignment problem to achieve optimal one-to-one matches. We argue that the transition to the Assignment problem is not reliable for general correspondence-based PCR. In this paper, we propose a heuristics stable matching policy called GS-matching, inspired by the Gale-Shapley algorithm. Compared to the other matching policies, our method can perform efficiently and find more non-repetitive inliers under low overlapping conditions. Furthermore, we employ the probability theory to analyze the feature matching task, providing new insights into this research problem. Extensive experiments validate the effectiveness of our matching policy, achieving better registration recall on multiple datasets.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C:\Users\Dustella\Zotero\storage\6KUHQ38Q\Zhang et al. - 2024 - GS-Matching Reconsidering Feature Matching task in Point Cloud Registration.pdf}
}
@inproceedings{zhangPolarNetImprovedGrid2020,
title = {{{PolarNet}}: {{An Improved Grid Representation}} for {{Online LiDAR Point Clouds Semantic Segmentation}}},
shorttitle = {{{PolarNet}}},
booktitle = {2020 {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
author = {Zhang, Yang and Zhou, Zixiang and David, Philip and Yue, Xiangyu and Xi, Zerong and Gong, Boqing and Foroosh, Hassan},
year = {2020},
month = jun,
pages = {9598--9607},
publisher = {IEEE},
address = {Seattle, WA, USA},
doi = {10.1109/CVPR42600.2020.00962},
urldate = {2024-10-25},
abstract = {The need for fine-grained perception in autonomous driving systems has resulted in recently increased research on online semantic segmentation of single-scan LiDAR. Despite the emerging datasets and technological advancements, it remains challenging due to three reasons: (1) the need for near-real-time latency with limited hardware; (2) uneven or even long-tailed distribution of LiDAR points across space; and (3) an increasing number of extremely fine-grained semantic classes. In an attempt to jointly tackle all the aforementioned challenges, we propose a new LiDAR-specific, nearest-neighbor-free segmentation algorithm --- PolarNet. Instead of using common spherical or bird's-eye-view projection, our polar bird's-eye-view representation balances the points across grid cells in a polar coordinate system, indirectly aligning a segmentation network's attention with the long-tailed distribution of the points along the radial axis. We find that our encoding scheme greatly increases the mIoU in three drastically different segmentation datasets of real urban LiDAR single scans while retaining near real-time throughput.},
copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
isbn = {978-1-7281-7168-5},
langid = {english},
file = {C:\Users\Dustella\Zotero\storage\M6KA53MU\Zhang et al. - 2020 - PolarNet An Improved Grid Representation for Online LiDAR Point Clouds Semantic Segmentation.pdf}
}
@misc{zhangStreetGaussians3D2024,
title = {Street {{Gaussians}} without {{3D Object Tracker}}},
author = {Zhang, Ruida and Li, Chengxi and Zhang, Chenyangguang and Liu, Xingyu and Yuan, Haili and Li, Yanyan and Ji, Xiangyang and Lee, Gim Hee},
year = {2024},
month = dec,
number = {arXiv:2412.05548},
eprint = {2412.05548},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2412.05548},
urldate = {2024-12-18},
abstract = {Realistic scene reconstruction in driving scenarios poses significant challenges due to fast-moving objects. Most existing methods rely on labor-intensive manual labeling of object poses to reconstruct dynamic objects in canonical space and move them based on these poses during rendering. While some approaches attempt to use 3D object trackers to replace manual annotations, the limited generalization of 3D trackers -- caused by the scarcity of large-scale 3D datasets -- results in inferior reconstructions in real-world settings. In contrast, 2D foundation models demonstrate strong generalization capabilities. To eliminate the reliance on 3D trackers and enhance robustness across diverse environments, we propose a stable object tracking module by leveraging associations from 2D deep trackers within a 3D object fusion strategy. We address inevitable tracking errors by further introducing a motion learning strategy in an implicit feature space that autonomously corrects trajectory errors and recovers missed detections. Experimental results on Waymo-NOTR datasets show we achieve state-of-the-art performance. Our code will be made publicly available.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C:\Users\Dustella\Zotero\storage\SZ3XTHZS\Zhang et al. - 2024 - Street Gaussians without 3D Object Tracker.pdf}
}
@misc{zhaoPointTransformer2021,
title = {Point {{Transformer}}},
author = {Zhao, Hengshuang and Jiang, Li and Jia, Jiaya and Torr, Philip and Koltun, Vladlen},
year = {2021},
month = sep,
number = {arXiv:2012.09164},
eprint = {2012.09164},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2012.09164},
urldate = {2025-01-06},
abstract = {Self-attention networks have revolutionized natural language processing and are making impressive strides in image analysis tasks such as image classification and object detection. Inspired by this success, we investigate the application of self-attention networks to 3D point cloud processing. We design self-attention layers for point clouds and use these to construct self-attention networks for tasks such as semantic scene segmentation, object part segmentation, and object classification. Our Point Transformer design improves upon prior work across domains and tasks. For example, on the challenging S3DIS dataset for large-scale semantic scene segmentation, the Point Transformer attains an mIoU of 70.4\% on Area 5, outperforming the strongest prior model by 3.3 absolute percentage points and crossing the 70\% mIoU threshold for the first time.},
archiveprefix = {arXiv},
langid = {american},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
file = {C\:\\Users\\Dustella\\Zotero\\storage\\QCKE5JKN\\Zhao et al. - 2021 - Point Transformer.pdf;C\:\\Users\\Dustella\\Zotero\\storage\\X7ND7BWL\\2012.html}
}
@misc{zhengGaussianADGaussianCentricEndtoEnd2024,
title = {{{GaussianAD}}: {{Gaussian-Centric End-to-End Autonomous Driving}}},
shorttitle = {{{GaussianAD}}},
author = {Zheng, Wenzhao and Wu, Junjie and Zheng, Yao and Zuo, Sicheng and Xie, Zixun and Yang, Longchao and Pan, Yong and Hao, Zhihui and Jia, Peng and Lang, Xianpeng and Zhang, Shanghang},
year = {2024},
month = dec,
number = {arXiv:2412.10371},
eprint = {2412.10371},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2412.10371},
urldate = {2024-12-18},
abstract = {Vision-based autonomous driving shows great potential due to its satisfactory performance and low costs. Most existing methods adopt dense representations (e.g., bird's eye view) or sparse representations (e.g., instance boxes) for decision-making, which suffer from the trade-off between comprehensiveness and efficiency. This paper explores a Gaussian-centric end-to-end autonomous driving (GaussianAD) framework and exploits 3D semantic Gaussians to extensively yet sparsely describe the scene. We initialize the scene with uniform 3D Gaussians and use surrounding-view images to progressively refine them to obtain the 3D Gaussian scene representation. We then use sparse convolutions to efficiently perform 3D perception (e.g., 3D detection, semantic map construction). We predict 3D flows for the Gaussians with dynamic semantics and plan the ego trajectory accordingly with an objective of future scene forecasting. Our GaussianAD can be trained in an end-to-end manner with optional perception labels when available. Extensive experiments on the widely used nuScenes dataset verify the effectiveness of our end-to-end GaussianAD on various tasks including motion planning, 3D occupancy prediction, and 4D occupancy forecasting. Code: https://github.com/wzzheng/GaussianAD.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning,Computer Science - Robotics},
file = {C:\Users\Dustella\Zotero\storage\G9T4HCVG\Zheng et al. - 2024 - GaussianAD Gaussian-Centric End-to-End Autonomous Driving.pdf}
}