# The altitude (target_h) and target_latitude are user-selectable:
# latitude can be +-30 or +-60 degrees, altitude in 10 km steps.
import logging
import netCDF4 as nc
import pandas as pd
import numpy as np
import os
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
from CONSTANT import DATA_BASEPATH


def _day_folder_name(day_of_year):
    """Return the COSMIC archive folder name for a given day of year.

    The archive zero-pads the day number to three digits,
    e.g. day 7 -> ``atmPrf_repro2021_2008_007``.

    NOTE(review): the year 2008 is hard-coded in the folder naming scheme
    even though ``cosmic_planet_daily_process`` takes a ``year`` argument --
    confirm against the on-disk layout before reusing for other years.
    """
    return f"atmPrf_repro2021_2008_{day_of_year:03d}"


def _process_per_folder(base_folder_path, i, target_h):
    """Read all occultation profiles of one day, keep rows near ``target_h``.

    Parameters
    ----------
    base_folder_path : str
        Root directory for the year (one sub-folder per day of year).
    i : int
        Day of year (1-366); also used as the day field of the timestamp.
    target_h : float
        Target altitude in km; rows within +-0.5 km are kept.

    Returns
    -------
    list[pandas.DataFrame] or None
        Single-element list with the day's filtered data, or ``None`` when
        the folder does not exist or contains no usable profiles.
    """
    folder_path = os.path.join(base_folder_path, _day_folder_name(i))
    if not os.path.exists(folder_path):
        print(f"文件夹 {folder_path} 不存在。")
        return None

    # Per-day cache: skip re-parsing the netCDF files when already processed.
    folder_level_cache_path = f"{folder_path}/planet_{target_h}.parquet"
    if os.path.exists(folder_level_cache_path):
        return [pd.read_parquet(folder_level_cache_path)]

    logging.info("处理文件夹 %s", folder_path)
    frames = []
    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.0390_nc'):
            continue
        finfo = os.path.join(folder_path, file_name)
        try:
            # Context manager guarantees the dataset is closed even when a
            # variable is missing (the original leaked the handle on error).
            with nc.Dataset(finfo, 'r') as dataset:
                temp = np.squeeze(dataset.variables['Temp'][:])
                altitude = np.squeeze(dataset.variables['MSL_alt'][:])
                lat = np.squeeze(dataset.variables['Lat'][:])
                lon = np.squeeze(dataset.variables['Lon'][:])
                hour = dataset.hour
                minute = dataset.minute

            # Profile arrays must align; raise (not assert) so the check
            # also runs under ``python -O``.
            if not (len(temp) == len(altitude) == len(lat) == len(lon)):
                raise ValueError("Arrays must be the same length")

            # Timestamp "DDDHHMM": day of year + hour + minute, zero-padded.
            datetime_str = f"{i:03d}{hour:02d}{minute:02d}"

            df = pd.DataFrame({
                'Datetime_str': datetime_str,
                'Longitude': lon,
                'Latitude': lat,
                'Altitude': altitude,
                'Temperature': temp,
            })
            # Keep only the +-0.5 km band around the target altitude.
            in_band = (df['Altitude'] >= target_h - 0.5) & \
                      (df['Altitude'] <= target_h + 0.5)
            frames.append(df[in_band])
        except Exception as e:
            # Best-effort: one corrupt file must not abort the whole day.
            print(f"处理文件 {finfo} 时出错: {e}")

    if not frames:
        # The original crashed in pd.concat([]) when a folder held no
        # matching files; treat an empty day like a missing one instead.
        logging.info("文件夹 %s 没有可用数据", folder_path)
        return None

    result_folder_level = pd.concat(frames, axis=0, ignore_index=True)
    logging.info("保存缓存文件 %s", folder_level_cache_path)
    logging.info("result_folder_length: %s", len(result_folder_level))
    result_folder_level.to_parquet(folder_level_cache_path)
    return [result_folder_level]


def cosmic_planet_daily_process(
    year=2008,
    target_latitude=30,
    target_h=40
):
    """Build a daily series of temperature near one latitude/altitude.

    For every day of ``year``, reads the COSMIC profiles, keeps samples
    within +-0.5 km of ``target_h``, then per day selects the single sample
    whose latitude is closest to ``target_latitude`` (and within +-2 deg).

    Parameters
    ----------
    year : int
        Data year; selects the ``{DATA_BASEPATH.cosmic}/{year}`` directory.
    target_latitude : float
        Latitude of interest in degrees.
    target_h : float
        Altitude of interest in km.

    Returns
    -------
    pandas.DataFrame
        Columns ``Longitude_Radians``, ``Time`` (fractional days),
        ``Temperature``; also written to a parquet cache.

    Raises
    ------
    ValueError
        When no data folder for the year yields any usable profile.
    """
    base_folder_path = f"{DATA_BASEPATH.cosmic}/{year}"
    cache_path = (
        f"{base_folder_path}/planet_{target_h}_{target_latitude}.parquet")
    if os.path.exists(cache_path):
        return pd.read_parquet(cache_path)

    dfs = []
    # range(1, 367) covers leap years (2008 is one); the original
    # range(1, 365) silently dropped the last day(s).  Non-existent day
    # folders are skipped inside the helper.
    for i in range(1, 367):
        result_folder_level = _process_per_folder(
            base_folder_path, i, target_h)
        if result_folder_level is not None:
            dfs.extend(result_folder_level)

    if not dfs:
        raise ValueError(f"{base_folder_path} 下没有找到任何数据")

    final_df = pd.concat(dfs, axis=0, ignore_index=True)

    # Distance of every sample from the target latitude; per day the
    # single closest sample is kept.
    final_df['Latitude_diff'] = np.abs(final_df['Latitude'] - target_latitude)

    def find_closest_row(group):
        # Row whose latitude is closest to target_latitude within one day.
        return group.loc[group['Latitude_diff'].idxmin()]

    logging.info("final_df: %s", final_df.head())
    closest_per_day = final_df.groupby('Datetime_str', as_index=False).apply(
        find_closest_row, include_groups=False)

    closest_per_day = closest_per_day.drop(columns=['Latitude_diff'])
    # Discard days whose closest sample is still more than 2 degrees away.
    closest_per_day = closest_per_day[
        (closest_per_day['Latitude'] >= (target_latitude - 2)) &
        (closest_per_day['Latitude'] <= (target_latitude + 2))]

    closest_per_day['Longitude_rad'] = np.radians(closest_per_day['Longitude'])
    print(closest_per_day)

    def convert_to_days(datetime_str):
        """Convert a "DDDHHMM" string to fractional days; None on failure."""
        try:
            dd = int(datetime_str[:3])   # day of year (DDD)
            hh = int(datetime_str[3:5])  # hour (HH)
            mm = int(datetime_str[5:])   # minute (MM)
            return (dd * 1440 + hh * 60 + mm) / 1440
        except Exception:
            logging.error(f"转换失败: {datetime_str}")
            return None

    try:
        closest_per_day['Time'] = closest_per_day['Datetime_str'].apply(
            convert_to_days)
    except Exception as e:
        # NOTE(review): with include_groups=False the grouping column may be
        # absent from the applied result depending on the pandas version --
        # confirm 'Datetime_str' survives the groupby above.
        print(e)

    # Background temperature (series mean) and its perturbation; the
    # perturbation column is computed but not part of the returned columns
    # (behavior kept from the original).
    background_temperature = closest_per_day['Temperature'].mean()
    closest_per_day['Temperature_perturbation'] = (
        closest_per_day['Temperature'] - background_temperature)

    # .copy() avoids SettingWithCopyWarning when renaming below.
    selected_columns = closest_per_day[
        ['Longitude_rad', 'Time', 'Temperature']].copy()
    selected_columns.columns = ['Longitude_Radians', 'Time', 'Temperature']

    # Persist the result so subsequent calls hit the cache fast-path above.
    selected_columns.to_parquet(cache_path)
    return selected_columns