-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathGLM_G211.py
171 lines (143 loc) · 6.55 KB
/
GLM_G211.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import argparse
import datetime
import G211
import glm as myglm
from glmtools.io.glm import GLMDataset
import logging
import matplotlib.pyplot as plt
from multiprocessing import Pool
import numpy as np
import os
import pandas as pd
import pdb
from scipy.spatial import KDTree
import sys
from tqdm import tqdm
import xarray
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
def get_argparser():
parser = argparse.ArgumentParser(description = "Accumulate GLM flashes for one hour on G211 grid.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('center', help="center of accumation time window")
parser.add_argument('twin', type=float, help="total width of accumulation time window in hours")
parser.add_argument("--clobber", action='store_true', help="clobber existing file(s)")
parser.add_argument("--maxbad", type=int, default=0, help= (
"maximum number of corrupt or missing GLM 20-second files in time window to return a file"
)
)
parser.add_argument('--pool', type=int, default=18, help="workers in pool")
parser.add_argument("-d", "--debug", action='store_true')
parser.add_argument("--odir", default="/glade/campaign/mmm/parc/ahijevyc/GLM", help="output path")
return parser
def bincount(l2, tree, lon_range, lat_range, n):
logging.debug(f"load GLMDataset {os.path.basename(l2)}")
try:
glm = GLMDataset(l2)
except Exception as error:
logging.warning(f"GLMDataset ({l2}) {error}")
return
# Get flashes in lat/lon bounds, as flashes are scattered across the full disk.
flashes_subset = glm.subset_flashes(lon_range = lon_range, lat_range = lat_range)
logging.debug(f"{flashes_subset.number_of_flashes.size} flashes")
if flashes_subset.number_of_flashes.size == 0:
return np.zeros(n)
dd, ii = tree.query(np.c_[flashes_subset.flash_lon, flashes_subset.flash_lat], distance_upper_bound=0.5)
bc = np.bincount(ii) # number of occurences of each ii value
bc = np.pad(bc, (0, n)) # pad with n zeros on the right side
bc = bc[0:n]
return bc
def accum_on_grid(ifiles, lon, lat, maxbad=0, pool=18):
# allow maxbad bad times (only 20s each)
lon_range = (lon.min(), lon.max())
lat_range = (lat.min(), lat.max())
logging.info(f"lon_range {lon_range} lat_range {lat_range}")
tree = KDTree(np.c_[lon.ravel(), lat.ravel()])
logging.info(f"process {len(ifiles)} GLM Datasets (level 2)")
if pool > 1:
items = [(ifile, tree, lon_range, lat_range, lon.size) for ifile in ifiles]
with Pool(pool) as p:
result = p.starmap(bincount, tqdm(items, total=len(ifiles)))
else:
result = [bincount(ifile, tree, lon_range, lat_range, lon.size) for ifile in ifiles]
# Filter out None's. Those are returned if GLMDataset is corrupt.
result = [x for x in result if x is not None]
nbad = len(ifiles) - len(result)
assert nbad <= maxbad, f"too many bad files ({nbad}/{maxbad+1})."
if nbad:
logging.warning(f"{nbad} bad files (max {maxbad+1}).")
flashes = np.array(result).sum(axis=0)
flashes = flashes.reshape(lon.shape)
#flashes = flashes.astype(np.int32) # didn't reduce filesize
flashes = xarray.DataArray(data=flashes, name="flashes", dims=["y","x"],
coords=dict(lon=(["y","x"],lon), lat=(["y","x"],lat)))
return flashes
def main():
"""
Download GLM for time range (twin).
Accumulate on G211 grid (40km) and half-spacing grid (20km).
Allow some missing data in the twin-hr window.
For example, the maximum number of missing files (maxbad)
could be equal to the time window in hours. There are 180 files per hour, so 1/180 = 0.6% can be missing.
If there are more bad files than maxbad, then output file is not created.
"""
parser = get_argparser()
args = parser.parse_args()
clobber = args.clobber
twin = args.twin
center = pd.to_datetime(args.center)
start = center - datetime.timedelta(hours=twin/2)
end = center + datetime.timedelta(hours=twin/2)
odir = os.path.join(args.odir, center.strftime('%Y'))
if args.debug:
logging.getLogger().setLevel(logging.DEBUG)
bucket = "noaa-goes16"
logging.info(f"download data [{start},{end}]")
list_of_level2_files = myglm.download(start, end, bucket=bucket, clobber=clobber)
assert list_of_level2_files is not None, f"glm download {start} {end} {bucket} failed"
if len(list_of_level2_files) == 0:
logging.error(f"no level2 files found")
sys.exit(1)
# make year directory
if not os.path.isdir(odir):
logging.info(f"making new directory {odir}")
os.makedirs(odir)
# Global attributes of output netCDF file.
attrs = G211.g211.proj4_params
attrs.update(
dict(
time_coverage_start=start.isoformat(),
time_coverage_center=center.isoformat(),
time_coverage_end=end.isoformat(),
bucket=bucket,
maxbad=args.maxbad,
twin=twin,
)
)
ofile = os.path.join(odir, center.strftime("%Y%m%d_%H%M") + f".glm_40km_{twin:.0f}hr.nc")
if os.path.exists(ofile) and not clobber:
logging.warning(f"found {ofile} skipping.")
else:
flashes = accum_on_grid(list_of_level2_files, G211.lon, G211.lat, maxbad=args.maxbad, pool=args.pool)
saveflashes(flashes, center, attrs, ofile)
# Now do half-distance grid (half the 40km half-grid spacing of G211)
ofile = os.path.join(odir, center.strftime("%Y%m%d_%H%M") + f".glm_20km_{twin:.0f}hr.nc")
if os.path.exists(ofile) and not clobber:
logging.warning(f"found {ofile} skipping.")
else:
grid = G211.x2()
lon, lat = grid.lon, grid.lat
flashes = accum_on_grid(list_of_level2_files, lon, lat, maxbad=args.maxbad, pool=args.pool)
saveflashes(flashes, center, attrs, ofile)
def saveflashes(flashes, center, attrs, ofile):
flashes = flashes.expand_dims(time=[center])
flashes.attrs.update(attrs)
logging.info(f"{flashes.sum().values} flashes over domain. max {flashes.values.max()} in one cell")
# Set encoding to minutes or hours since... or else it will be an integer number of days with no fraction.
flashes.time.encoding["units"] = "minutes since "+center.isoformat()
flashes.encoding["zlib"] = True
# got about 20% compression with complevel=9 or default of 4.
#flashes.encoding["complevel"] = 9
flashes.to_netcdf(ofile,unlimited_dims=["time"])
logging.info(f"created {ofile}")
if __name__ == "__main__":
main()