import glob
import logging
import re
import time

import pandas as pd

from modules.balloon.extract_wave import extract_wave, is_terrain_wave
from modules.balloon.read_data import read_data

# Column layout of the per-file result rows: the file path, the terrain-wave
# flag returned by is_terrain_wave ("c"), then the values returned by
# extract_wave in order.
filter_columns = [
    "file_name",
    "c",
    "a",
    "b",
    "omega_upper",
    "w_f",
    "ver_wave_len",
    "hori_wave_len",
    "c_x",
    "c_y",
    "c_z",
    "Ek",
    "E_p",
    "MFu",
    "MFv",
    "u1",
    "v1",
    "T1",
    "zhou_qi",
]

# Passed unchanged to extract_wave / is_terrain_wave.
# NOTE(review): presumably the station latitude (degrees) and the local
# gravitational acceleration (m/s^2) -- confirm against those helpers.
lat = 52.21
g = 9.76

# Empty at import time; not written anywhere in this module.
combos = {}

# The three tables below are parallel: comboMode[i] lists the modes offered
# for comboType[i], and comboDate[i][j] holds the two date-field labels for
# comboMode[i][j].
# NOTE(review): presumably these feed linked selection widgets -- confirm
# against the caller.
comboType = [
    "探空气球",  # radiosonde balloon
    "流星雷达",  # meteor radar
    "Saber",
    "TIDI",
    "COSMIC",
]

comboMode = [
    ["重力波单次", "重力波统计"],
    ["重力波月统计", "潮汐波单次", "潮汐波月统计"],
    ["行星波月统计", "重力波单次", "重力波月统计"],
    ["行星波月统计"],
    ["行星波月统计"],
]

comboDate = [
    [["年", "时间"], ["起始年", "终止年"]],
    [["年", "月"], ["年", "日期"], ["年", "月"]],
    [["起始月", "-"], ["月", "日"], ["月", "-"]],
    [["起始月", "-"]],
    [["起始月", "-"]],
]

# One file path per line; see _load_has_wave_cache.
_HAS_WAVE_CACHE_PATH = "./cache/ballon_lin_has_wave"


def get_ballon_files():
    """Return every ``.nc`` file found recursively under ``data/探空气球``.

    ``glob.glob`` yields an empty list (it does not raise) when the
    directory is missing, so no exception handling is required here.
    """
    return glob.glob("data/探空气球/**/*.nc", recursive=True)


# Snapshot taken once at import time; files added later are not picked up.
all_ballon_files = get_ballon_files()


def get_ballon_path_by_year(start_year, end_year):
    """Return the known balloon files whose path mentions a year in range.

    A path is kept when it contains the substring ``LIN-<year>`` for at
    least one year in ``start_year..end_year`` (both ends inclusive).
    """
    # Build the tags once instead of re-formatting them for every path.
    year_tags = [f"LIN-{year}" for year in range(start_year, end_year + 1)]
    return [
        path
        for path in all_ballon_files
        if any(tag in path for tag in year_tags)
    ]


def _load_has_wave_cache(path=_HAS_WAVE_CACHE_PATH):
    """Return the set of stripped lines stored in the has-wave cache file.

    A missing cache file yields an empty set, which the caller treats as
    "no cache available, process every file".
    """
    try:
        # NOTE(review): opened with the platform default encoding, as
        # before; the writer of this file is not visible here, so the
        # encoding is left untouched.
        with open(path, "r") as f:
            return {line.strip() for line in f}
    except FileNotFoundError:
        logging.debug("Cache file %s not found, processing all files", path)
        return set()


def get_ballon_full_df_by_year(start_year, end_year):
    """Extract wave parameters for every balloon file in a year range.

    Each selected file is read with ``read_data`` and passed to
    ``extract_wave``; files for which extraction fails or returns nothing
    are skipped. When the has-wave cache is non-empty, only files listed in
    it are processed.

    Args:
        start_year: First year to include.
        end_year: Last year to include (inclusive).

    Returns:
        A DataFrame with the columns of ``filter_columns`` and one row per
        file that produced wave data. The columns are present even when no
        row was produced.
    """
    # Kept inside the function for backward compatibility: callers get
    # DEBUG output without configuring logging themselves. basicConfig does
    # nothing once the root logger already has handlers.
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s: %(message)s')

    start_time = time.time()
    logging.debug(
        "Starting get_ballon_full_df_by_year with start_year=%s, end_year=%s",
        start_year, end_year)

    # Timing the path retrieval
    t0 = time.time()
    paths = get_ballon_path_by_year(start_year, end_year)
    t1 = time.time()
    logging.debug("Retrieved %d paths in %.2f seconds", len(paths), t1 - t0)

    # Optimization: only files listed in the cache need to be processed.
    # An empty cache disables the filter.
    cache_has_waves = _load_has_wave_cache()

    # Rows are collected here and turned into a DataFrame once at the end;
    # concatenating inside the loop copies the whole frame on every file.
    rows = []
    total = len(paths)
    for idx, file in enumerate(paths, 1):
        if cache_has_waves and file not in cache_has_waves:
            logging.debug("Skipping %s as it has no wave data", file)
            continue

        file_start_time = time.time()
        logging.debug("Processing file %d/%d: %s", idx, total, file)

        # Read data
        data = read_data(file)
        read_time = time.time()
        logging.debug("Read data in %.2f seconds",
                      read_time - file_start_time)

        # Extract wave. Deliberately best-effort: one bad file must not
        # abort the whole batch, so any failure is logged and the file is
        # skipped.
        try:
            wave = extract_wave(data, lat, g)
        except Exception as e:
            logging.error(f"Error extracting wave from {file}: {e}")
            continue
        extract_time = time.time()
        logging.debug("Extracted wave in %.2f seconds",
                      extract_time - read_time)

        # len() rather than truthiness: the concrete type returned by
        # extract_wave is not visible here.
        if len(wave) == 0:
            logging.debug("No wave data in %s, skipping", file)
            continue

        # Determine terrain wave
        c = is_terrain_wave(data, lat, g)
        terrain_time = time.time()
        logging.debug("Determined terrain wave in %.2f seconds",
                      terrain_time - extract_time)

        # Row layout follows filter_columns: file name, terrain flag, then
        # the extracted values.
        rows.append([file, c, *wave])
        logging.debug("Total time for %s: %.2f seconds",
                      file, time.time() - file_start_time)

    build_start_time = time.time()
    year_df = pd.DataFrame(rows, columns=filter_columns)
    logging.debug("Built DataFrame with %d rows in %.2f seconds",
                  len(year_df), time.time() - build_start_time)

    total_time = time.time() - start_time
    logging.debug(
        "Completed get_ballon_full_df_by_year in %.2f seconds", total_time)

    return year_df


def get_has_wave_data_by_year(start_year, end_year):
    """Return the rows of the year-range result whose ``b`` column equals 1.

    Args:
        start_year: First year to include.
        end_year: Last year to include (inclusive).

    Returns:
        The filtered DataFrame; empty, with the ``filter_columns`` columns,
        when nothing matched.
    """
    df = get_ballon_full_df_by_year(start_year, end_year)
    return df[df["b"] == 1]