117 lines
3.4 KiB
Python
117 lines
3.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
Verify the Crack500 dataset structure and data files.

Checks train.txt, val.txt, and test.txt and validates all listed paths.
"""
|
|
|
|
from pathlib import Path
|
|
import argparse
|
|
|
|
|
|
def verify_data_files(data_root, split_files):
    """Verify that split list files exist and that every path they list exists.

    Each non-blank line of a split file is expected to hold exactly two
    whitespace-separated fields: an image path and a mask path, both
    relative to ``data_root``. A per-split report and a final summary are
    printed to stdout.

    Args:
        data_root: Dataset root directory (str or Path) against which the
            listed relative paths are resolved.
        split_files: Mapping of split name (e.g. ``'train'``) to the path of
            that split's list file.

    Returns:
        dict mapping split name to a dict with the integer keys ``'total'``
        (non-blank lines), ``'valid'`` (lines whose image and mask both
        exist), ``'missing_images'`` and ``'missing_masks'``. Splits whose
        list file is not found are reported on stdout and omitted from the
        result.
    """
    data_root = Path(data_root)
    print(f"Verifying dataset at: {data_root}")
    print("=" * 60)

    total_stats = {}

    for split_name, split_file in split_files.items():
        print(f"\n{split_name.upper()} SET:")
        print("-" * 60)

        # is_file() rather than exists(): a directory passed by mistake is
        # reported as missing instead of crashing inside open().
        if not Path(split_file).is_file():
            print(f"❌ File not found: {split_file}")
            continue

        # Explicit encoding so non-ASCII paths decode identically on every
        # platform; 'utf-8-sig' also strips a BOM that would otherwise be
        # glued onto the first image path.
        with open(split_file, 'r', encoding='utf-8-sig') as f:
            lines = [line.strip() for line in f if line.strip()]

        print(f"Total samples: {len(lines)}")

        valid_count = 0
        missing_images = []
        missing_masks = []

        for line in lines:
            parts = line.split()
            if len(parts) != 2:
                # Malformed lines still count toward 'total', so the split
                # cannot be reported as fully valid.
                print(f"⚠️ Invalid format: {line}")
                continue

            img_rel, mask_rel = parts
            img_path = data_root / img_rel
            mask_path = data_root / mask_rel

            # Hit the filesystem once per path and reuse the answer.
            img_ok = img_path.exists()
            mask_ok = mask_path.exists()

            if not img_ok:
                missing_images.append(str(img_path))
            if not mask_ok:
                missing_masks.append(str(mask_path))
            if img_ok and mask_ok:
                valid_count += 1

        print(f"Valid samples: {valid_count}")
        _print_missing("images", missing_images)
        _print_missing("masks", missing_masks)

        if valid_count == len(lines):
            print("✅ All paths valid!")

        total_stats[split_name] = {
            'total': len(lines),
            'valid': valid_count,
            'missing_images': len(missing_images),
            'missing_masks': len(missing_masks),
        }

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY:")
    print("=" * 60)
    for split_name, stats in total_stats.items():
        print(f"{split_name}: {stats['valid']}/{stats['total']} valid samples")

    return total_stats


def _print_missing(label, paths, limit=5):
    """Print how many ``paths`` are missing and list at most ``limit`` of them."""
    if not paths:
        return
    print(f"❌ Missing {label}: {len(paths)}")
    for path in paths[:limit]:
        print(f" - {path}")
    # Always give the user a few concrete examples, then summarize the rest.
    if len(paths) > limit:
        print(f" ... and {len(paths) - limit} more")
|
|
|
|
|
|
def main(argv=None):
    """Parse command-line options, verify the dataset, and return an exit code.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, as before.

    Returns:
        0 when every requested split was checked and all of its samples are
        valid, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description='Verify Crack500 dataset')
    parser.add_argument('--data_root', type=str, default='./crack500')
    parser.add_argument('--train_file', type=str, default='./crack500/train.txt')
    parser.add_argument('--val_file', type=str, default='./crack500/val.txt')
    parser.add_argument('--test_file', type=str, default='./crack500/test.txt')

    args = parser.parse_args(argv)

    split_files = {
        'train': args.train_file,
        'val': args.val_file,
        'test': args.test_file,
    }

    stats = verify_data_files(args.data_root, split_files)

    # verify_data_files leaves out splits whose list file was not found, so a
    # missing split must be treated as a failure explicitly; otherwise all()
    # over an empty or partial dict would wrongly report success.
    unverified = [name for name in split_files if name not in stats]
    all_valid = not unverified and all(
        s['valid'] == s['total'] for s in stats.values()
    )

    if all_valid:
        print("\n✅ Dataset verification passed!")
        return 0

    if unverified:
        print(f"\n❌ Split files not verified: {', '.join(unverified)}")
    print("\n❌ Dataset verification failed!")
    return 1
|
|
|
|
|
|
if __name__ == '__main__':
    # Raise SystemExit directly: the bare exit() helper is installed by the
    # `site` module for interactive sessions and is not guaranteed to exist
    # (e.g. under `python -S` or in frozen builds).
    raise SystemExit(main())
|