2025-03-05 11:40:19 +08:00

208 lines
6.6 KiB
Python

import glob
import os
import re
import pandas as pd
import time
import logging
from CONSTANT import DATA_BASEPATH
from modules.balloon.extract_wave import extract_wave, is_terrain_wave
from modules.balloon.read_data import read_data
# Column order for one result row built by the balloon pipeline: the file
# name and the terrain-wave flag are prepended, followed by the wave
# parameters in the order extract_wave() returns them.
filter_columns = [
    "file_name",
    "c",
    "a",
    "b",
    "omega_upper",
    "w_f",
    "ver_wave_len",
    "hori_wave_len",
    "c_x",
    "c_y",
    "c_z",
    "Ek",
    "E_p",
    "MFu",
    "MFv",
    "u1",
    "v1",
    "T1",
    "zhou_qi",
]
# Latitude (degrees) and gravitational acceleration (presumably m/s^2) fed
# to extract_wave()/is_terrain_wave(); site-specific — confirm per station.
lat = 52.21
g = 9.76
combos = {}
# Data-source/instrument labels (Chinese; roughly: sounding balloon,
# meteor radar, Saber, TIDI, COSMIC).
comboType = [
    "探空气球",
    "流星雷达",
    "Saber",
    "TIDI",
    "COSMIC",
]
# Analysis modes available for each instrument (parallel to comboType);
# labels are gravity-wave / tidal-wave / planetary-wave, single vs. monthly.
comboMode = [
    ["重力波单次", "重力波统计"],
    ["重力波月统计", "潮汐波单次", "潮汐波月统计"],
    ["行星波月统计", "重力波单次", "重力波月统计"],
    ["行星波月统计"],
    ["行星波月统计"],
]
# Date-picker label pairs per instrument/mode (parallel to comboMode);
# presumably [start-label, end-label] for a UI — TODO confirm with caller.
comboDate = [
    [["", "时间"], ["起始年", "终止年"]],
    [["", ""], ["", "日期"], ["", ""]],
    [["起始月", "-"], ["", ""], ["", "-"]],
    [["起始月", "-"]],
    [["起始月", "-"]],
]
def get_dataframe_between_year(all_year_data, start_year, end_year, station):
    """Filter rows whose file_name embeds a '{station}-YYYY' year in range.

    Args:
        all_year_data: DataFrame with a string 'file_name' column.
        start_year: first year to keep (inclusive).
        end_year: last year to keep (inclusive).
        station: station name that precedes the 4-digit year in file_name.

    Returns:
        The filtered DataFrame (original index preserved).
    """
    # Extract the 4-digit year ONCE; the original ran the same regex over the
    # whole column twice (once per bound).  Series.between is inclusive on
    # both ends, matching the original >= / <= pair.
    years = (all_year_data['file_name']
             .str.extract(rf'{station}-(\d{{4}})')[0]
             .astype(int))
    return all_year_data[years.between(start_year, end_year)]
def get_ballon_files():
    """Return the paths of every balloon NetCDF (*.nc) file under the data root."""
    pattern = f"{DATA_BASEPATH.balloon}/**/*.nc"
    try:
        found = glob.glob(pattern, recursive=True)
    except FileNotFoundError:
        found = []
    return found
def get_ballon_path_by_year(start_year, end_year, station=None):
    """Return balloon file paths whose name contains '{station}-{year}' for
    any year in [start_year, end_year]."""
    markers = [f"{station}-{year}" for year in range(start_year, end_year + 1)]
    return [
        path
        for path in get_ballon_files()
        if any(marker in path for marker in markers)
    ]
def _process_balloon_file(file):
    """Read one balloon .nc file and build one result row.

    Returns a list ``[file_path, terrain_wave_flag, *wave_params]`` aligned
    with ``filter_columns``, or ``None`` when the file yields no wave or a
    processing step fails (failures are logged, not raised).
    """
    file_start_time = time.time()
    data = read_data(file)
    read_time = time.time()
    logging.debug(f"Read data in {read_time - file_start_time:.2f} seconds")
    try:
        wave = extract_wave(data, lat, g)
        extract_time = time.time()
        logging.debug(f"Extracted wave in {extract_time - read_time:.2f} seconds")
    except Exception as e:
        logging.error(f"Error extracting wave from {file}: {e}")
        return None
    if len(wave) == 0:
        logging.debug(f"No wave data in {file}, skipping")
        return None
    try:
        c = is_terrain_wave(data, lat, g)
    except Exception as e:
        logging.error(f"Error determining terrain wave from {file}: {e}")
        return None
    logging.debug(
        f"Determined terrain wave in {time.time() - extract_time:.2f} seconds")
    # Prepend the identity columns so the row lines up with filter_columns.
    wave.insert(0, c)
    wave.insert(0, file)
    return wave


def get_ballon_full_df_by_year(start_year, end_year, station=None, ignore_cache=False):
    """Load (or compute and cache) the wave-parameter DataFrame for a station.

    Lookup order: exact-range parquet cache, then any cached file whose year
    span covers the requested range, finally a full reprocess of the raw
    .nc files (result written back to the exact-range cache).

    Args:
        start_year: first year (inclusive).
        end_year: last year (inclusive).
        station: station name embedded in the file names; required.
        ignore_cache: when True, skip all cache lookups and recompute.

    Returns:
        pandas.DataFrame with one row per usable file, columns = filter_columns.

    Raises:
        ValueError: if ``station`` is None or no matching raw files exist.
    """
    # NOTE(review): configuring logging at call time is a process-wide side
    # effect; kept as-is for backward compatibility.
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s: %(message)s')
    if station is None:
        raise ValueError("Station is required")
    cache_base_dir = f"{DATA_BASEPATH.balloon}/cache/"
    cache_path = f"{DATA_BASEPATH.balloon}/cache/{station}-b{start_year}-e{end_year}.parquet"
    if os.path.exists(cache_path) and not ignore_cache:
        logging.debug(f"Reading cache from {cache_path}")
        return pd.read_parquet(cache_path)
    # A cached file whose span covers [start_year, end_year] can serve this
    # request after filtering.  Bug fix: honor ignore_cache here too — the
    # original consulted the wider cache even when ignore_cache was set.
    if os.path.exists(cache_base_dir) and not ignore_cache:
        # Bug fix: the original pattern was a non-raw string ("\d" is a
        # deprecated escape), left the '.' unescaped, and was unanchored.
        pattern = re.compile(
            rf"{re.escape(station)}-b(\d{{4}})-e(\d{{4}})\.parquet$")
        for file in os.listdir(cache_base_dir):
            m = pattern.match(file)
            if m is None:
                continue
            b_year, e_year = int(m.group(1)), int(m.group(2))
            logging.debug(
                f"Found cache file {file}, b_year={b_year}, e_year={e_year}")
            if b_year <= start_year and e_year >= end_year:
                wider_df = pd.read_parquet(f"{cache_base_dir}/{file}")
                return get_dataframe_between_year(
                    wider_df, start_year, end_year, station)
    start_time = time.time()
    logging.debug(
        f"Starting get_ballon_full_df_by_year with start_year={start_year}, end_year={end_year}")
    t0 = time.time()
    paths = get_ballon_path_by_year(start_year, end_year, station)
    if len(paths) == 0:
        raise ValueError(
            f"No balloon data found for {station} between {start_year} and {end_year}")
    t1 = time.time()
    logging.debug(f"Retrieved {len(paths)} paths in {t1 - t0:.2f} seconds")
    # Collect rows and build the DataFrame once at the end: the original
    # pd.concat inside the loop was O(n^2) in the number of files.
    rows = []
    for idx, file in enumerate(paths, 1):
        file_start_time = time.time()
        logging.debug(f"Processing file {idx}/{len(paths)}: {file}")
        row = _process_balloon_file(file)
        if row is not None:
            rows.append(row)
        logging.debug(
            f"Total time for {file}: {time.time() - file_start_time:.2f} seconds")
    total_time = time.time() - start_time
    logging.debug(
        f"Completed get_ballon_full_df_by_year in {total_time:.2f} seconds")
    # Constructing with explicit columns keeps the frame well-formed even when
    # every file was skipped (the original raised KeyError on an empty frame).
    year_df = pd.DataFrame(rows, columns=filter_columns)
    # ' ' is the sentinel for "no horizontal wavelength" in the raw rows.
    year_df['hori_wave_len'] = year_df['hori_wave_len'].apply(
        lambda x: float(x) if x != ' ' else None)
    # Persist to the exact-range cache; exist_ok avoids a mkdir race.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    year_df.to_parquet(cache_path)
    return year_df
def get_has_wave_data_by_year(start_year, end_year, station=None):
    """Return the rows flagged as terrain waves (column 'b' == 1) in range.

    Bug fix: the original never forwarded a station, so the loader's
    mandatory-station check made every call raise ValueError.  A defaulted
    ``station`` parameter is added (backward-compatible signature) and
    passed through; callers must supply it to get data.

    Args:
        start_year: first year (inclusive).
        end_year: last year (inclusive).
        station: station name required by the underlying loader.

    Returns:
        pandas.DataFrame subset where column 'b' equals 1.

    Raises:
        ValueError: propagated from the loader when station is None or no
            data is found.
    """
    df = get_ballon_full_df_by_year(start_year, end_year, station=station)
    return df[df["b"] == 1]