#!/usr/bin/env python3
"""
Verify Crack500 dataset structure and data files
Checks train.txt, val.txt, test.txt and validates all paths
"""

from pathlib import Path
import argparse

# Missing paths are listed individually only when there are at most this
# many of them; above that only the count is printed to keep output short.
_MAX_LISTED_PATHS = 5


def _report_missing(label, paths):
    """Print how many *paths* are missing, listing them when there are few.

    Args:
        label: Plural noun used in the message ("images" or "masks").
        paths: Missing paths as strings; nothing is printed when empty.
    """
    if not paths:
        return
    print(f"❌ Missing {label}: {len(paths)}")
    if len(paths) <= _MAX_LISTED_PATHS:
        for path in paths:
            print(f" - {path}")


def _verify_split(data_root, split_file):
    """Check every ``<image> <mask>`` entry of one existing split file.

    Each non-blank line must hold exactly two whitespace-separated paths,
    both relative to *data_root*. Lines with any other number of fields are
    reported and counted in ``total`` but never in ``valid``.

    Args:
        data_root: ``Path`` that the listed relative paths are joined onto.
        split_file: Path of the split listing to read (must exist).

    Returns:
        Dict with integer counts under the keys ``total``, ``valid``,
        ``missing_images`` and ``missing_masks``.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(split_file, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    print(f"Total samples: {len(lines)}")

    valid_count = 0
    missing_images = []
    missing_masks = []

    for line in lines:
        parts = line.split()
        if len(parts) != 2:
            print(f"⚠️ Invalid format: {line}")
            continue

        img_rel, mask_rel = parts
        img_path = data_root / img_rel
        mask_path = data_root / mask_rel

        # Query the filesystem once per path and reuse the answer.
        img_found = img_path.exists()
        mask_found = mask_path.exists()

        if not img_found:
            missing_images.append(str(img_path))
        if not mask_found:
            missing_masks.append(str(mask_path))
        if img_found and mask_found:
            valid_count += 1

    print(f"Valid samples: {valid_count}")
    _report_missing("images", missing_images)
    _report_missing("masks", missing_masks)

    if valid_count == len(lines):
        print("✅ All paths valid!")

    return {
        'total': len(lines),
        'valid': valid_count,
        'missing_images': len(missing_images),
        'missing_masks': len(missing_masks)
    }


def verify_data_files(data_root, split_files):
    """Verify dataset files exist and are properly formatted.

    Prints a per-split report followed by a summary.

    Args:
        data_root: Directory that the relative image/mask paths in each
            split file are resolved against.
        split_files: Mapping of split name (e.g. ``'train'``) to the path of
            its listing file.

    Returns:
        Dict mapping each split name whose file could be read to its counts
        (``total``, ``valid``, ``missing_images``, ``missing_masks``).
        Splits whose listing file does not exist are reported but have no
        entry in the result, so callers must treat an absent key as a
        failure.
    """
    data_root = Path(data_root)

    print(f"Verifying dataset at: {data_root}")
    print("="*60)

    total_stats = {}

    for split_name, split_file in split_files.items():
        print(f"\n{split_name.upper()} SET:")
        print("-"*60)

        if not Path(split_file).exists():
            print(f"❌ File not found: {split_file}")
            continue

        total_stats[split_name] = _verify_split(data_root, split_file)

    # Summary
    print("\n" + "="*60)
    print("SUMMARY:")
    print("="*60)
    for split_name in split_files:
        stats = total_stats.get(split_name)
        if stats is None:
            # Make an unreadable split visible instead of silently omitting it.
            print(f"{split_name}: split file not found")
        else:
            print(f"{split_name}: {stats['valid']}/{stats['total']} valid samples")

    return total_stats


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        ``0`` when every requested split file was found and all of its
        entries are valid, ``1`` otherwise.
    """
    parser = argparse.ArgumentParser(description='Verify Crack500 dataset')
    parser.add_argument('--data_root', type=str, default='./crack500')
    parser.add_argument('--train_file', type=str, default='./crack500/train.txt')
    parser.add_argument('--val_file', type=str, default='./crack500/val.txt')
    parser.add_argument('--test_file', type=str, default='./crack500/test.txt')

    args = parser.parse_args(argv)

    split_files = {
        'train': args.train_file,
        'val': args.val_file,
        'test': args.test_file
    }

    stats = verify_data_files(args.data_root, split_files)

    # A split whose listing file is missing has no stats entry. It must fail
    # the run: otherwise all() over the remaining (possibly zero) entries
    # would report success for a dataset that was never checked.
    every_split_read = all(name in stats for name in split_files)
    all_valid = every_split_read and all(
        s['valid'] == s['total'] for s in stats.values()
    )

    if all_valid:
        print("\n✅ Dataset verification passed!")
        return 0
    else:
        print("\n❌ Dataset verification failed!")
        return 1


if __name__ == '__main__':
    # SystemExit works without the site-provided exit() helper.
    raise SystemExit(main())