2025-02-13 20:03:12 +08:00

164 lines
4.4 KiB
Python

import glob
import re
import pandas as pd
import balloon
import time
import logging
# Column labels of the per-file result table, in row order: the source file
# path, the value returned by ``balloon.is_terrain_wave``, then the entries of
# the list returned by ``balloon.extract_wave``.
filter_columns = [
    "file_name", "c",
    "a", "b", "omega_upper", "w_f",
    "ver_wave_len", "hori_wave_len",
    "c_x", "c_y", "c_z",
    "Ek", "E_p",
    "MFu", "MFv",
    "u1", "v1", "T1",
    "zhou_qi",
]
# Site constants handed to ``balloon.extract_wave`` and
# ``balloon.is_terrain_wave`` for every processed file.
# NOTE(review): presumably station latitude in degrees and gravitational
# acceleration in m/s^2 -- units are not stated in this file; confirm against
# the ``balloon`` module.
lat, g = 52.21, 9.76
# Starts out empty; nothing in the visible part of this module fills it.
combos = dict()

# Data-source labels (runtime strings, kept in Chinese):
# sounding balloon, meteor radar, Saber, TIDI, COSMIC.
comboType = ["探空气球", "流星雷达", "Saber", "TIDI", "COSMIC"]

# Analysis modes per data source.  The outer list has one entry per item of
# ``comboType`` -- presumably index-aligned with it; confirm against the UI
# code that consumes these tables.
# Glossary: 重力波 = gravity wave, 潮汐波 = tidal wave, 行星波 = planetary wave,
# 单次 = single event, 统计 = statistics, 月统计 = monthly statistics.
comboMode = [
    ["重力波单次", "重力波统计"],
    ["重力波月统计", "潮汐波单次", "潮汐波月统计"],
    ["行星波月统计", "重力波单次", "重力波月统计"],
    ["行星波月统计"],
    ["行星波月统计"],
]

# One pair of date-field labels per mode, nested the same way as
# ``comboMode``.  Glossary: 时间 = time, 日期 = date, 起始年 = start year,
# 终止年 = end year, 起始月 = start month.
comboDate = [
    [
        ["", "时间"],
        ["起始年", "终止年"],
    ],
    [
        ["", ""],
        ["", "日期"],
        ["", ""],
    ],
    [
        ["起始月", "-"],
        ["", ""],
        ["", "-"],
    ],
    [
        ["起始月", "-"],
    ],
    [
        ["起始月", "-"],
    ],
]
def get_ballon_files():
    """Return the paths of all sounding-balloon NetCDF files on disk.

    Recursively searches ``data/探空气球`` (relative to the current working
    directory) for files ending in ``.nc``.

    Returns:
        list[str]: The matching paths in sorted order; an empty list when the
        directory does not exist or contains no ``.nc`` files.
    """
    # glob.glob() returns an empty list for a missing directory instead of
    # raising, so no FileNotFoundError handling is required.  Sorting makes the
    # otherwise arbitrary, file-system-dependent order reproducible.
    return sorted(glob.glob("data/探空气球/**/*.nc", recursive=True))
# Module-level snapshot of the file list, taken once at import time;
# get_ballon_path_by_year() filters this list rather than re-scanning the disk.
all_ballon_files = get_ballon_files()
def get_ballon_path_by_year(start_year, end_year):
    """Select the known balloon files that belong to a range of years.

    A path is kept when it contains the substring ``LIN-<year>`` for at least
    one year in ``start_year..end_year`` (both ends inclusive).  Candidates
    come from the module-level ``all_ballon_files`` list and keep their
    relative order.

    Args:
        start_year: First year of the range.
        end_year: Last year of the range, inclusive.

    Returns:
        list[str]: The matching paths; empty when ``start_year > end_year``.
    """
    selected = []
    for path in all_ballon_files:
        for year in range(start_year, end_year + 1):
            if f"LIN-{year}" in path:
                selected.append(path)
                # One matching year is enough; never add a path twice.
                break
    return selected
def _read_has_wave_cache(cache_path="./cache/ballon_lin_has_wave"):
    """Return the set of file paths listed (one per line) in the cache file.

    A missing cache file yields an empty set, which the caller treats as
    "no cache available, process every file".  Blank lines are ignored.
    """
    # NOTE(review): the file is read with the platform default encoding, as
    # before; the paths contain non-ASCII characters, so confirm this matches
    # whatever writes the cache.
    try:
        with open(cache_path, "r") as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        logging.debug(
            f"Cache file {cache_path} not found, processing all files")
        return set()


def get_ballon_full_df_by_year(start_year, end_year):
    """Build the wave-parameter table for all balloon files in a year range.

    Every file selected by ``get_ballon_path_by_year`` is read with
    ``balloon.read_data`` and passed to ``balloon.extract_wave``.  Files for
    which extraction fails or returns nothing are skipped.  When the has-wave
    cache lists any files, only the listed files are processed at all.

    Args:
        start_year: First year of the range.
        end_year: Last year of the range, inclusive.

    Returns:
        pandas.DataFrame: One row per file that yielded wave data, with the
        columns named in ``filter_columns``.  When no file qualifies, the
        frame is empty but still carries those columns.

    Raises:
        ValueError: If a result row does not have exactly one value per entry
            of ``filter_columns``.
    """
    # basicConfig() does nothing once the root logger has handlers, so only
    # the first call in a process has any effect.
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s: %(message)s')
    start_time = time.time()
    logging.debug(
        f"Starting get_ballon_full_df_by_year with start_year={start_year}, end_year={end_year}")

    t0 = time.time()
    paths = get_ballon_path_by_year(start_year, end_year)
    logging.debug(
        f"Retrieved {len(paths)} paths in {time.time() - t0:.2f} seconds")

    # Optimization: files absent from a non-empty cache are known to contain
    # no wave and are skipped without being read.
    cache_has_waves = _read_has_wave_cache()

    rows = []
    for idx, file in enumerate(paths, 1):
        if cache_has_waves and file not in cache_has_waves:
            logging.debug(f"Skipping {file} as it has no wave data")
            continue

        file_start_time = time.time()
        logging.debug(f"Processing file {idx}/{len(paths)}: {file}")

        data = balloon.read_data(file)
        read_time = time.time()
        logging.debug(
            f"Read data in {read_time - file_start_time:.2f} seconds")

        # Extraction is best-effort: a failure on one file must not abort the
        # whole run, so it is logged and the file is treated as wave-less.
        try:
            wave = balloon.extract_wave(data, lat, g)
        except Exception as e:
            logging.error(f"Error extracting wave from {file}: {e}")
            wave = []
        else:
            logging.debug(
                f"Extracted wave in {time.time() - read_time:.2f} seconds")
        extract_time = time.time()

        if len(wave) == 0:
            logging.debug(f"No wave data in {file}, skipping")
            continue

        c = balloon.is_terrain_wave(data, lat, g)
        logging.debug(
            f"Determined terrain wave in {time.time() - extract_time:.2f} seconds")

        # Row layout follows filter_columns: file name, terrain-wave result,
        # then the extracted wave values.
        row = [file, c, *wave]
        if len(row) != len(filter_columns):
            raise ValueError(
                f"{len(filter_columns)} columns expected, "
                f"but row for {file} has {len(row)} values")
        rows.append(row)

        logging.debug(
            f"Total time for {file}: {time.time() - file_start_time:.2f} seconds")

    # Build the frame once instead of concatenating inside the loop; passing
    # the column names keeps them present even when there are no rows.
    year_df = pd.DataFrame(rows, columns=filter_columns)

    total_time = time.time() - start_time
    logging.debug(
        f"Completed get_ballon_full_df_by_year in {total_time:.2f} seconds")
    return year_df
def get_has_wave_data_by_year(start_year, end_year):
    """Return the rows of the yearly wave table whose ``b`` column equals 1.

    Args:
        start_year: First year of the range.
        end_year: Last year of the range, inclusive.

    Returns:
        pandas.DataFrame: The subset of ``get_ballon_full_df_by_year``'s
        result with ``b == 1``; an empty frame when there is no data.
    """
    df = get_ballon_full_df_by_year(start_year, end_year)
    # An empty result may come back without any columns, in which case
    # df["b"] would raise KeyError; there is nothing to filter anyway.
    if df.empty:
        return df
    return df[df["b"] == 1]