Dustella 2025-12-14 00:14:32 +08:00
parent 401ff54bbe
commit 120aa3ec3c
3 changed files with 197 additions and 36 deletions

crack.bib Normal file

@@ -0,0 +1,159 @@
@inproceedings{benzOmniCrack30kBenchmarkCrack2024,
title = {{{OmniCrack30k}}: {{A Benchmark}} for {{Crack Segmentation}} and the {{Reasonable Effectiveness}} of {{Transfer Learning}}},
shorttitle = {{{OmniCrack30k}}},
booktitle = {Proceedings of the {{IEEE}}/{{CVF Conference}} on {{Computer Vision}} and {{Pattern Recognition}}},
author = {Benz, Christian and Rodehorst, Volker},
year = 2024,
pages = {3876--3886},
urldate = {2025-12-12},
langid = {english},
}
@article{chenTransUNetRethinkingUNet2024,
title = {{{TransUNet}}: {{Rethinking}} the {{U-Net}} Architecture Design for Medical Image Segmentation through the Lens of Transformers},
shorttitle = {{{TransUNet}}},
author = {Chen, Jieneng and Mei, Jieru and Li, Xianhang and Lu, Yongyi and Yu, Qihang and Wei, Qingyue and Luo, Xiangde and Xie, Yutong and Adeli, Ehsan and Wang, Yan and Lungren, Matthew P. and Zhang, Shaoting and Xing, Lei and Lu, Le and Yuille, Alan and Zhou, Yuyin},
year = 2024,
month = oct,
journal = {Medical Image Analysis},
volume = {97},
pages = {103280},
issn = {1361-8415},
doi = {10.1016/j.media.2024.103280},
urldate = {2025-12-13},
abstract = {Medical image segmentation is crucial for healthcare, yet convolution-based methods like U-Net face limitations in modeling long-range dependencies. To address this, Transformers designed for sequence-to-sequence predictions have been integrated into medical image segmentation. However, a comprehensive understanding of Transformers' self-attention in U-Net components is lacking. TransUNet, first introduced in 2021, is widely recognized as one of the first models to integrate Transformer into medical image analysis. In this study, we present the versatile framework of TransUNet that encapsulates Transformers' self-attention into two key modules: (1) a Transformer encoder tokenizing image patches from a convolution neural network (CNN) feature map, facilitating global context extraction, and (2) a Transformer decoder refining candidate regions through cross-attention between proposals and U-Net features. These modules can be flexibly inserted into the U-Net backbone, resulting in three configurations: Encoder-only, Decoder-only, and Encoder+Decoder. TransUNet provides a library encompassing both 2D and 3D implementations, enabling users to easily tailor the chosen architecture. Our findings highlight the encoder's efficacy in modeling interactions among multiple abdominal organs and the decoder's strength in handling small targets like tumors. It excels in diverse medical applications, such as multi-organ segmentation, pancreatic tumor segmentation, and hepatic vessel segmentation. Notably, our TransUNet achieves a significant average Dice improvement of 1.06\% and 4.30\% for multi-organ segmentation and pancreatic tumor segmentation, respectively, when compared to the highly competitive nn-UNet, and surpasses the top-1 solution in the BrasTS2021 challenge. 2D/3D Code and models are available at https://github.com/Beckschen/TransUNet and https://github.com/Beckschen/TransUNet-3D, respectively.},
keywords = {Medical image segmentation,U-Net,Vision Transformers},
}
@book{desjardinsProceedingsCanadianSociety2024,
title = {Proceedings of the {{Canadian Society}} for {{Civil Engineering Annual Conference}} 2023, {{Volume}} 4: {{Construction Track}}},
shorttitle = {Proceedings of the {{Canadian Society}} for {{Civil Engineering Annual Conference}} 2023, {{Volume}} 4},
author = {Desjardins, Serge and Poitras, G{\'e}rard J. and {Nik-Bakht}, Mazdak},
year = 2024,
month = sep,
publisher = {Springer Nature},
abstract = {This book comprises the proceedings of the Annual Conference of the Canadian Society for Civil Engineering 2023. The contents of this volume focus on the specialty track in construction with topics on modular and offsite construction, BIM, construction planning and project management, construction automation, AI and robotics in construction, sustainable construction, asset management, and construction safety, among others. This volume will prove a valuable resource for researchers and professionals.},
googlebooks = {dPYhEQAAQBAJ},
isbn = {978-3-031-61499-6},
langid = {english},
keywords = {Technology & Engineering / Civil / General,Technology & Engineering / Civil / Highway & Traffic,Technology & Engineering / Civil / Soil & Rock,Technology & Engineering / Construction / Heating Ventilation & Air Conditioning,Technology & Engineering / Environmental / General}
}
@article{liuDeepCrackDeepHierarchical2019,
title = {{{DeepCrack}}: {{A}} Deep Hierarchical Feature Learning Architecture for Crack Segmentation},
shorttitle = {{{DeepCrack}}},
author = {Liu, Yahui and Yao, Jian and Lu, Xiaohu and Xie, Renping and Li, Li},
year = 2019,
month = apr,
journal = {Neurocomputing},
volume = {338},
pages = {139--153},
issn = {0925-2312},
doi = {10.1016/j.neucom.2019.01.036},
urldate = {2025-12-13},
abstract = {Automatic crack detection from images of various scenes is a useful and challenging task in practice. In this paper, we propose a deep hierarchical convolutional neural network (CNN), called as DeepCrack, to predict pixel-wise crack segmentation in an end-to-end method. DeepCrack consists of the extended Fully Convolutional Networks (FCN) and the Deeply-Supervised Nets (DSN). During the training, the elaborately designed model learns and aggregates multi-scale and multi-level features from the low convolutional layers to the high-level convolutional layers, which is different from the standard approaches of only using the last convolutional layer. DSN provides integrated direct supervision for features of each convolutional stage. We apply both guided filtering and Conditional Random Fields (CRFs) methods to refine the final prediction results. A benchmark dataset consisting of 537 images with manual annotation maps are built to verify the effectiveness of our proposed method. Our method achieved state-of-the-art performances on the proposed dataset (mean I/U of 85.9, best F-score of 86.5, and 0.1~s per image).},
keywords = {Convolutional neural network,Crack detection,Crack detection dataset,Guided filtering,Hierarchical convolutional features,Semantic segmentation},
}
@misc{PDFFeaturePyramid,
title = {{{Feature Pyramid}} and {{Hierarchical Boosting Network}} for {{Pavement Crack Detection}}},
howpublished = {https://www.researchgate.net/publication/330244656\_Feature\_Pyramid\_and\_Hierarchical\_Boosting\_Network\_for\_Pavement\_Crack\_Detection},
urldate = {2025-12-13},
langid = {english}
}
@inproceedings{qiCrackSegMambaLightweightMamba2024,
title = {{{CrackSegMamba}}: {{A Lightweight Mamba Model}} for {{Crack Segmentation}}},
shorttitle = {{{CrackSegMamba}}},
booktitle = {2024 {{IEEE International Conference}} on {{Robotics}} and {{Biomimetics}} ({{ROBIO}})},
author = {Qi, Weiqing and Ma, Fulong and Zhao, Guoyang and Liu, Ming and Ma, Jun},
year = 2024,
month = dec,
pages = {601--607},
issn = {2994-3574},
doi = {10.1109/ROBIO64047.2024.10907574},
urldate = {2025-12-12},
abstract = {Crack localization and segmentation are essential for infrastructure maintenance and safety assessments, enabling timely repairs and preventing structural failures. Despite advancements in deep learning, crack segmentation remains challenging due to the need for real-time performance and computational efficiency. Existing methods often rely on large, resource-intensive models, limiting their practical deployment. We introduce CrackSegMamba, a novel model featuring Channel-wise Parallel Mamba (CPM) Modules, which achieves state-of-the-art performance with fewer than 0.23 million parameters and just 0.7 GFLOPs. CrackSegMamba reduces computational cost by 40-fold and parameter count by nearly 100-fold compared to existing models, while maintaining comparable accuracy. These features make CrackSegMamba ideal for real-time applications. Additionally, we present Crack20000, an annotated dataset of 20,000 concrete crack images to support further research and validation. Evaluations on the Crack500 [1] and Crack20000 datasets demonstrate that CrackSegMamba delivers comparable accuracy to leading methods, with significantly reduced computational requirements. Project page is available at: https://sites.google.com/view/cracksegmamba.},
langid = {american},
keywords = {Accuracy,Computational efficiency,Computational modeling,Location awareness,Maintenance,Maintenance engineering,Real-time systems,Robots,Robustness,Safety},
}
@misc{raviSAM2Segment2024,
title = {{{SAM}} 2: {{Segment Anything}} in {{Images}} and {{Videos}}},
shorttitle = {{{SAM}} 2},
author = {Ravi, Nikhila and Gabeur, Valentin and Hu, Yuan-Ting and Hu, Ronghang and Ryali, Chaitanya and Ma, Tengyu and Khedr, Haitham and R{\"a}dle, Roman and Rolland, Chloe and Gustafson, Laura and Mintun, Eric and Pan, Junting and Alwala, Kalyan Vasudev and Carion, Nicolas and Wu, Chao-Yuan and Girshick, Ross and Doll{\'a}r, Piotr and Feichtenhofer, Christoph},
year = 2024,
month = oct,
number = {arXiv:2408.00714},
eprint = {2408.00714},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2408.00714},
urldate = {2025-12-13},
abstract = {We present Segment Anything Model 2 (SAM 2), a foundation model towards solving promptable visual segmentation in images and videos. We build a data engine, which improves model and data via user interaction, to collect the largest video segmentation dataset to date. Our model is a simple transformer architecture with streaming memory for real-time video processing. SAM 2 trained on our data provides strong performance across a wide range of tasks. In video segmentation, we observe better accuracy, using 3x fewer interactions than prior approaches. In image segmentation, our model is more accurate and 6x faster than the Segment Anything Model (SAM). We believe that our data, model, and insights will serve as a significant milestone for video segmentation and related perception tasks. We are releasing our main model, dataset, as well as code for model training and our demo.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
}
@misc{ronnebergerUNetConvolutionalNetworks2015,
title = {U-{{Net}}: {{Convolutional Networks}} for {{Biomedical Image Segmentation}}},
shorttitle = {U-{{Net}}},
author = {Ronneberger, Olaf and Fischer, Philipp and Brox, Thomas},
year = 2015,
month = may,
number = {arXiv:1505.04597},
eprint = {1505.04597},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.1505.04597},
urldate = {2025-12-13},
abstract = {There is large consent that successful training of deep networks requires many thousand annotated training samples. In this paper, we present a network and training strategy that relies on the strong use of data augmentation to use the available annotated samples more efficiently. The architecture consists of a contracting path to capture context and a symmetric expanding path that enables precise localization. We show that such a network can be trained end-to-end from very few images and outperforms the prior best method (a sliding-window convolutional network) on the ISBI challenge for segmentation of neuronal structures in electron microscopic stacks. Using the same network trained on transmitted light microscopy images (phase contrast and DIC) we won the ISBI cell tracking challenge 2015 in these categories by a large margin. Moreover, the network is fast. Segmentation of a 512x512 image takes less than a second on a recent GPU. The full implementation (based on Caffe) and the trained networks are available at http://lmb.informatik.uni-freiburg.de/people/ronneber/u-net .},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
}
@misc{ruanVMUNetVisionMamba2024,
title = {{{VM-UNet}}: {{Vision Mamba UNet}} for {{Medical Image Segmentation}}},
shorttitle = {{{VM-UNet}}},
author = {Ruan, Jiacheng and Li, Jincheng and Xiang, Suncheng},
year = 2024,
month = nov,
number = {arXiv:2402.02491},
eprint = {2402.02491},
primaryclass = {eess},
publisher = {arXiv},
doi = {10.48550/arXiv.2402.02491},
urldate = {2025-12-13},
abstract = {In the realm of medical image segmentation, both CNN-based and Transformer-based models have been extensively explored. However, CNNs exhibit limitations in long-range modeling capabilities, whereas Transformers are hampered by their quadratic computational complexity. Recently, State Space Models (SSMs), exemplified by Mamba, have emerged as a promising approach. They not only excel in modeling long-range interactions but also maintain a linear computational complexity. In this paper, leveraging state space models, we propose a U-shape architecture model for medical image segmentation, named Vision Mamba UNet (VM-UNet). Specifically, the Visual State Space (VSS) block is introduced as the foundation block to capture extensive contextual information, and an asymmetrical encoder-decoder structure is constructed with fewer convolution layers to save calculation cost. We conduct comprehensive experiments on the ISIC17, ISIC18, and Synapse datasets, and the results indicate that VM-UNet performs competitively in medical image segmentation tasks. To our best knowledge, this is the first medical image segmentation model constructed based on the pure SSM-based model. We aim to establish a baseline and provide valuable insights for the future development of more efficient and effective SSM-based segmentation systems. Our code is available at https://github.com/JCruan519/VM-UNet.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Electrical Engineering and Systems Science - Image and Video Processing},
}
@inproceedings{taoConvolutionalTransformerNetworkCrack2023,
title = {A {{Convolutional-Transformer Network}} for {{Crack Segmentation}} with {{Boundary Awareness}}},
booktitle = {2023 {{IEEE International Conference}} on {{Image Processing}} ({{ICIP}})},
author = {Tao, Huaqi and Liu, Bingxi and Cui, Jinqiang and Zhang, Hong},
year = 2023,
month = oct,
eprint = {2302.11728},
primaryclass = {cs},
pages = {86--90},
doi = {10.1109/ICIP49359.2023.10222276},
urldate = {2025-12-13},
abstract = {Cracks play a crucial role in assessing the safety and durability of manufactured buildings. However, the long and sharp topological features and complex background of cracks make the task of crack segmentation extremely challenging. In this paper, we propose a novel convolutional-transformer network based on encoder-decoder architecture to solve this challenge. Particularly, we designed a Dilated Residual Block (DRB) and a Boundary Awareness Module (BAM). The DRB pays attention to the local detail of cracks and adjusts the feature dimension for other blocks as needed. And the BAM learns the boundary features from the dilated crack label. Furthermore, the DRB is combined with a lightweight transformer that captures global information to serve as an effective encoder. Experimental results show that the proposed network performs better than state-of-the-art algorithms on two typical datasets. Datasets, code, and trained models are available for research at https://github.com/HqiTao/CT-crackseg.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computer Vision and Pattern Recognition,Electrical Engineering and Systems Science - Image and Video Processing},
}


@@ -31,8 +31,8 @@
#normal-box(color: none)[
== Introduction
The Segment Anything Model (SAM) has demonstrated remarkable
zero-shot segmentation capabilities on natural images. However, its zero-shot performance on domain-specific tasks remains underexplored.
The Segment Anything Model (SAM) @raviSAM2Segment2024
has zero-shot segmentation capabilities on natural images. However, its zero-shot performance on domain-specific tasks remains underexplored.
We investigate SAM2's effectiveness for *pavement crack segmentation*, a task characterized by thin, *low-contrast* structures with *complex topologies*.
@@ -84,11 +84,7 @@
caption: [Types of Prompts],
) <types-of-prompts>
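The poster does not state how the bbox and point prompts are generated; the sketch below shows one common zero-shot convention, deriving both from the ground-truth mask. The function name and sampling scheme are illustrative assumptions, not the authors' method.

import numpy as np

def prompts_from_mask(gt, n_points=1, seed=0):
    # Bounding box (x0, y0, x1, y1) spanning the mask, plus n foreground points.
    ys, xs = np.nonzero(gt)
    box = np.array([xs.min(), ys.min(), xs.max(), ys.max()])
    rng = np.random.default_rng(seed)
    idx = rng.choice(len(xs), size=n_points, replace=False)
    points = np.stack([xs[idx], ys[idx]], axis=1)  # (n, 2) in (x, y) order
    labels = np.ones(n_points, dtype=int)          # 1 = foreground click
    return box, points, labels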
// #h(0.1pt)
#set text(size: 16pt)
@@ -110,17 +106,14 @@
),
caption: [SAM2 Segmentation workflow],
)
Some supervised models are taken into comparison: UNet, DeepCrack, TransUNet,
CT-CrackSeg, VM-UNet, CrackSegMamba.
Several supervised models are included for comparison: UNet @ronnebergerUNetConvolutionalNetworks2015, DeepCrack @liuDeepCrackDeepHierarchical2019,
CT-CrackSeg @taoConvolutionalTransformerNetworkCrack2023, VM-UNet @ruanVMUNetVisionMamba2024, CrackSegMamba @qiCrackSegMambaLightweightMamba2024, TransUNet @chenTransUNetRethinkingUNet2024.
]
#normal-box(color: none)[
== Experiments and Results
#figure(
image("img/examples.png"),
caption: [Examples of SAM2 results],
)
*Evaluation*
@@ -130,23 +123,27 @@
$ bold("F1") = 2 * ("Precision" * "Recall") / ("Precision" + "Recall") $ <f1>
SAM2 with bbox prompts (39.6% IoU) lags behind supervised models, including the 2015 UNet @ronnebergerUNetConvolutionalNetworks2015.
#figure(
image("img/metrics.png"),
caption: [Model Metrics Comparison],
)
SAM2 with bbox prompts (39.6% IoU) lags behind supervised models, including UNet 2015.
Bounding box prompts yield the best performance among zero-shot methods. There is a 4.7x performance gap between bbox (39.6% IoU) and 1-point prompts (8.4% IoU).
#figure(
// columns[
image("img/sam_iou.png", width: 14em),
// #colbreak()
// #image("img/sam_f1.png")
// ],
caption: [IoU of SAM2 with 4 prompt strategies],
)
Bounding box prompts yield the best performance among zero-shot methods. There is a 4.7x performance gap between bbox (39.6% IoU) and 1-point prompts (8.4% IoU).
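As a rough illustration of the two prompt types compared here, the sketch below shows how a bounding-box prompt and a single-point prompt might be passed to SAM2 via the sam2 package; the config/checkpoint paths, image file, and coordinates are placeholder assumptions, and the exact API may differ by package version.

import numpy as np
from PIL import Image
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

# Placeholder config and checkpoint paths (not from this repository).
predictor = SAM2ImagePredictor(
    build_sam2("configs/sam2.1/sam2.1_hiera_l.yaml",
               "checkpoints/sam2.1_hiera_large.pt"))

image = np.array(Image.open("crack.jpg").convert("RGB"))  # placeholder image
predictor.set_image(image)

# Bounding-box prompt in (x0, y0, x1, y1) pixel coordinates.
masks_box, scores_box, _ = predictor.predict(
    box=np.array([10, 40, 620, 180]), multimask_output=False)

# Single foreground point prompt (label 1 marks a positive click).
masks_pt, scores_pt, _ = predictor.predict(
    point_coords=np.array([[320, 110]]),
    point_labels=np.array([1]),
    multimask_output=False)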
#figure(
image("img/examples.png"),
caption: [Examples of SAM2 results],
)
]
@@ -166,22 +163,19 @@
#normal-box(color: none)[
== Key Findings and Discussion
// *Prompt Effectiveness*
1-point prompts perform poorly (12.3% IoU), indicating insufficient guidance for complex crack structures. 5-point prompts approach bbox performance for *highly irregular cracks*, suggesting multiple points help capture shape.
Since SAM was trained on natural images, pavement cracks violate some of its key assumptions: they *lack clear object boundaries*, have *low contrast* with the background, and exhibit *extreme aspect ratios (length >> width)*.
This highlights the limitations of the zero-shot approach without fine-tuning.
// *Single Point Prompt Limitations*
1-point prompts perform poorly (12.3% IoU), indicating insufficient guidance for complex crack structures. 5-point prompts approach bbox performance for highly irregular cracks, suggesting multiple points help capture shape.
]
#normal-box(color: none)[
== Conclusion and Future Work
SAM2 shows limited zero-shot capability for crack segmentation. Bounding box prompts significantly outperform point-based prompts. Performance still lags behind supervised methods, indicating need for domain adaptation.
SAM2 shows *limited zero-shot capability for crack segmentation*. Bounding box prompts significantly outperform point-based prompts. Performance still lags behind supervised methods, indicating the need for domain adaptation.
]
@@ -189,14 +183,21 @@
// Content
#normal-box(color: none)[
== References
#columns()[
#bibliography("./cit.bib", title: none)
]
]
#columns()[
#set text(size: 12pt)
#bibliography("./crack.bib", title: none, full: false)
Hanwen Yu | Email: Hanwen.Yu24\@student.xjtlu.edu.cn
]
// #[
// // align right
// #set align(right)
// 2467345 |
// Hanwen Yu | Email: Hanwen.Yu24\@student.xjtlu.edu.cn
// ]
]
]


@@ -158,9 +158,10 @@
[
#place(horizon)[
#set text(size: titletext-size, fill: titletext-color)
*#current-title* #current-subtitle \
#set text(size: 0.5em)
#current-author \
*#current-title* #current-subtitle
#set text(size: 0.6em)
\
#current-author\
#current-affiliation
]
],
@@ -175,7 +176,7 @@
// Right
[
#place(top + right, dy: 18pt, dx: 30pt)[#current-logo-1]
#place(top + right, dy: 10pt, dx: 50pt)[#current-logo-1]
],
),
)
@@ -187,7 +188,7 @@
height: 100%,
inset: 3%,
fill: bg-color,
stroke: 1pt,
// stroke: 1pt,
columns(col)[#body],
)