sam_crack/scripts/prepare_data.py
2025-12-24 17:15:36 +08:00

117 lines
3.4 KiB
Python

#!/usr/bin/env python3
"""
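Verify the Crack500 dataset structure and its split files.

Reads the train/val/test split lists (train.txt, val.txt and test.txt by
default) and checks that every image and mask path they reference is present
under the dataset root.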
"""
from pathlib import Path
import argparse
def _print_missing(label, missing, limit=5):
    """Print how many paths are missing, followed by at most *limit* examples."""
    if not missing:
        return
    print(f"❌ Missing {label}: {len(missing)}")
    for path in missing[:limit]:
        print(f" - {path}")
    if len(missing) > limit:
        # Always show a sample so large failures are still actionable.
        print(f" ... and {len(missing) - limit} more")


def verify_data_files(data_root, split_files):
    """Verify that every sample listed in the split files exists on disk.

    Each non-blank line of a split file must contain exactly two
    whitespace-separated paths, ``<image> <mask>``, both relative to
    *data_root*. A report is printed for every split, followed by a summary.

    Args:
        data_root: Dataset root directory (``str`` or ``Path``) that the
            relative image/mask paths are resolved against.
        split_files: Mapping of split name to the path of its list file.

    Returns:
        A dict mapping each split name to a dict with the keys ``total``
        (non-blank lines), ``valid`` (lines whose image and mask are both
        regular files), ``missing_images``, ``missing_masks`` and
        ``invalid_format`` (lines that do not have exactly two fields).
        Splits whose list file is not found are reported on stdout but
        omitted from the result, so callers should compare the returned keys
        with *split_files* to detect them.
    """
    data_root = Path(data_root)
    print(f"Verifying dataset at: {data_root}")
    print("=" * 60)
    total_stats = {}
    for split_name, split_file in split_files.items():
        print(f"\n{split_name.upper()} SET:")
        print("-" * 60)
        split_path = Path(split_file)
        # is_file() also rejects a directory, which open() could not read.
        if not split_path.is_file():
            print(f"❌ File not found: {split_file}")
            continue
        # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which
        # would otherwise become part of the first image path.
        with open(split_path, 'r', encoding='utf-8-sig') as f:
            lines = [line.strip() for line in f if line.strip()]
        print(f"Total samples: {len(lines)}")

        valid_count = 0
        invalid_format = 0
        missing_images = []
        missing_masks = []
        for line in lines:
            parts = line.split()
            if len(parts) != 2:
                print(f"⚠️ Invalid format: {line}")
                invalid_format += 1
                continue
            img_rel, mask_rel = parts
            img_path = data_root / img_rel
            mask_path = data_root / mask_rel
            # Query the filesystem once per path; a directory is not a sample.
            img_ok = img_path.is_file()
            mask_ok = mask_path.is_file()
            if not img_ok:
                missing_images.append(str(img_path))
            if not mask_ok:
                missing_masks.append(str(mask_path))
            if img_ok and mask_ok:
                valid_count += 1

        print(f"Valid samples: {valid_count}")
        _print_missing("images", missing_images)
        _print_missing("masks", missing_masks)
        # Do not claim success for a split that lists no samples at all.
        if lines and valid_count == len(lines):
            print("✅ All paths valid!")
        total_stats[split_name] = {
            'total': len(lines),
            'valid': valid_count,
            'missing_images': len(missing_images),
            'missing_masks': len(missing_masks),
            'invalid_format': invalid_format,
        }

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY:")
    print("=" * 60)
    for split_name, stats in total_stats.items():
        print(f"{split_name}: {stats['valid']}/{stats['total']} valid samples")
    return total_stats
def main(argv=None):
    """Parse command-line options, verify the dataset and return an exit code.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read the process arguments from ``sys.argv``.

    Returns:
        0 when every requested split was verified and all of its samples are
        valid, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description='Verify Crack500 dataset')
    parser.add_argument('--data_root', type=str, default='./crack500')
    parser.add_argument('--train_file', type=str, default='./crack500/train.txt')
    parser.add_argument('--val_file', type=str, default='./crack500/val.txt')
    parser.add_argument('--test_file', type=str, default='./crack500/test.txt')
    args = parser.parse_args(argv)

    split_files = {
        'train': args.train_file,
        'val': args.val_file,
        'test': args.test_file,
    }
    stats = verify_data_files(args.data_root, split_files)

    # verify_data_files() leaves out splits whose list file is missing, so
    # check every requested split rather than only the returned entries;
    # otherwise a missing (or entirely absent) set of files passes vacuously.
    all_valid = all(
        name in stats and stats[name]['valid'] == stats[name]['total']
        for name in split_files
    )
    if all_valid:
        print("\n✅ Dataset verification passed!")
        return 0
    print("\n❌ Dataset verification failed!")
    return 1
# The bare `exit` helper is installed by the `site` module for interactive
# sessions and is not guaranteed to exist (e.g. `python -S`); raising
# SystemExit propagates main()'s return code without relying on it.
if __name__ == '__main__':
    raise SystemExit(main())