Coverage for src/CSET/operators/read.py: 100%

79 statements  

« prev     ^ index     » next       coverage.py v7.5.4, created at 2024-07-02 08:44 +0000

1# Copyright 2022-2023 Met Office and contributors. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14 

15"""Operators for reading various types of files from disk.""" 

16 

17import ast 

18import logging 

19import warnings 

20from collections.abc import Iterable 

21from pathlib import Path 

22 

23import iris 

24import iris.coords 

25import iris.cube 

26import numpy as np 

27 

28 

class NoDataWarning(UserWarning):
    """Warning raised when a read operation produced no cubes."""

31 

32 

def read_cube(
    loadpath: Path,
    constraint: iris.Constraint = None,
    filename_pattern: str = "*",
    **kwargs,
) -> iris.cube.Cube:
    """Read a single cube from files.

    Read operator that takes a path string (can include wildcards), and uses
    iris to load the cube matching the constraint.

    If the loaded data is split across multiple files, a filename_pattern can be
    specified to select the read files using Unix shell-style wildcards. In this
    case the loadpath should point to the directory containing the data.

    Ensemble data can also be loaded. If it has a realization coordinate
    already, it will be directly used. If not, it will have its member number
    guessed from the filename, based on one of several common patterns. For
    example the pattern *emXX*, where XX is the realization.

    Deterministic data will be loaded with a realization of 0, allowing it to be
    processed in the same way as ensemble data.

    Arguments
    ---------
    loadpath: pathlike
        Path to where .pp/.nc files are located
    constraint: iris.Constraint | iris.ConstraintCombination, optional
        Constraints to filter data by. Defaults to unconstrained.
    filename_pattern: str, optional
        Unix shell-style glob pattern to match filenames to. Defaults to "*"

    Returns
    -------
    cubes: iris.cube.Cube
        Cube loaded

    Raises
    ------
    FileNotFoundError
        If the provided path does not exist
    ValueError
        If the constraint doesn't produce a single cube.
    """
    cubes = read_cubes(loadpath, constraint, filename_pattern)
    # TODO: Fix coordinate name to enable full level and half level merging.
    # Anything other than exactly one cube means the constraint was
    # ambiguous or matched nothing, so reject it.
    if len(cubes) != 1:
        raise ValueError(
            f"Constraint doesn't produce single cube. {constraint}\n{cubes}"
        )
    return cubes[0]

86 

87 

def read_cubes(
    loadpath: Path,
    constraint: iris.Constraint = None,
    filename_pattern: str = "*",
    **kwargs,
) -> iris.cube.CubeList:
    """Read cubes from files.

    Read operator that takes a path string (can include wildcards), and uses
    iris to load_cube all the cubes matching the constraint and return a
    CubeList object.

    If the loaded data is split across multiple files, a filename_pattern can be
    specified to select the read files using Unix shell-style wildcards. In this
    case the loadpath should point to the directory containing the data.

    Ensemble data can also be loaded. If it has a realization coordinate
    already, it will be directly used. If not, it will have its member number
    guessed from the filename, based on one of several common patterns. For
    example the pattern *emXX*, where XX is the realization.

    Deterministic data will be loaded with a realization of 0, allowing it to be
    processed in the same way as ensemble data.

    Data output by XIOS (such as LFRic) has its per-file metadata removed so
    that the cubes merge across files.

    Arguments
    ---------
    loadpath: pathlike
        Path to where .pp/.nc files are located
    constraint: iris.Constraint | iris.ConstraintCombination, optional
        Constraints to filter data by. Defaults to unconstrained.
    filename_pattern: str, optional
        Unix shell-style glob pattern to match filenames to. Defaults to "*"

    Returns
    -------
    cubes: iris.cube.CubeList
        Cubes loaded

    Raises
    ------
    FileNotFoundError
        If the provided path does not exist
    """
    input_files = _check_input_files(loadpath, filename_pattern)
    # A constraint of None lets everything be loaded.
    logging.debug("Constraint: %s", constraint)
    # Load the data, then reload with correct handling if it is ensemble data.
    callback = _create_callback(is_ensemble=False)
    cubes = iris.load(input_files, constraint, callback=callback)
    if _is_ensemble(cubes):
        callback = _create_callback(is_ensemble=True)
        cubes = iris.load(input_files, constraint, callback=callback)

    # Merge cubes now metadata has been fixed. CubeList.merge() returns a new
    # CubeList rather than merging in place, so the result must be captured —
    # the previous bare `cubes.merge()` call silently discarded the merge.
    cubes = cubes.merge()

    logging.debug("Loaded cubes: %s", cubes)
    if len(cubes) == 0:
        warnings.warn(
            "No cubes loaded, check your constraints!", NoDataWarning, stacklevel=2
        )
    return cubes

153 

154 

def _is_ensemble(cubelist: iris.cube.CubeList) -> bool:
    """Test if a CubeList is likely to be ensemble data.

    If cubes either have a realization dimension, or there are multiple files
    for the same time-step, we can assume it is ensemble data.
    """
    seen_structures = set()
    for cube in cubelist:
        # A realization coordinate with any non-zero member marks an
        # ensemble; realization 0 is reserved for deterministic data.
        if cube.coords("realization") and any(cube.coord("realization").points != 0):
            return True
        # A repeated cube structure (identical XML representation) means the
        # same diagnostic was loaded from more than one file, i.e. members.
        structure = cube.xml()
        if structure in seen_structures:
            logging.info("Ensemble data loaded.")
            logging.debug("Cube Contents: %s", seen_structures)
            return True
        seen_structures.add(structure)
    logging.info("Deterministic data loaded.")
    logging.debug("Cube Contents: %s", seen_structures)
    return False

177 

178 

def _create_callback(is_ensemble: bool) -> callable:
    """Compose together the needed callbacks into a single function."""
    # Select the realization-handling callback once, outside the per-cube
    # callback, instead of re-branching on every call.
    realization_callback = _ensemble_callback if is_ensemble else _deterministic_callback

    def callback(cube: iris.cube.Cube, field, filename: str):
        realization_callback(cube, field, filename)
        _lfric_normalise_callback(cube, field, filename)

    return callback

190 

191 

def _ensemble_callback(cube, field, filename):
    """Add a realization coordinate to a cube.

    Uses the filename to add an ensemble member ('realization') to each cube.
    Assumes data is formatted enuk_um_0XX/enukaa_pd0HH.pp where XX is the
    ensemble member.

    Arguments
    ---------
    cube: Cube
        ensemble member cube
    field
        Raw data variable, unused.
    filename: str
        filename of ensemble member data
    """
    # Cubes that already carry a realization coordinate need no guessing.
    if cube.coords("realization"):
        return
    marker = filename.find("em")
    if marker != -1:
        # Filename follows the *emXX* convention; the two characters after
        # "em" are the member number.
        member = np.int32(filename[marker + 2 : marker + 4])
    else:
        # Fall back to the raw fields file layout enuk_um_0XX/enukaa_pd0HH,
        # extracting XX by position from the end of the path.
        member = np.int32(filename[-15:-13])
    cube.add_aux_coord(iris.coords.AuxCoord(member, standard_name="realization"))

218 

219 

def _deterministic_callback(cube, field, filename):
    """Give deterministic cubes a realization of 0.

    This means they can be handled in the same way as ensembles through the rest
    of the code.
    """
    # Leave cubes alone if a realization coordinate already exists.
    if cube.coords("realization"):
        return
    realization_coord = iris.coords.AuxCoord(
        np.int32(0), standard_name="realization", units="1"
    )
    cube.add_aux_coord(realization_coord)

231 

232 

def _lfric_normalise_callback(cube: iris.cube.Cube, field, filename):
    """Normalise attributes that prevents LFRic cube from merging.

    The uuid and timeStamp relate to the output file, as saved by XIOS, and has
    no relation to the data contained. These attributes are removed.

    The um_stash_source is a list of STASH codes for when an LFRic field maps to
    multiple UM fields, however it can be encoded in any order. This attribute
    is sorted to prevent this. This attribute is only present in LFRic data that
    has been converted to look like UM data.
    """
    # Strip per-file metadata written by XIOS; it varies between files and
    # blocks merging while saying nothing about the data itself.
    for transient_attribute in ("timeStamp", "uuid"):
        cube.attributes.pop(transient_attribute, None)

    # um_stash_source encodes a list of STASH codes in arbitrary order;
    # canonicalise by parsing, sorting, and re-encoding as a string.
    stash_codes = cube.attributes.get("um_stash_source")
    if stash_codes:
        cube.attributes["um_stash_source"] = str(sorted(ast.literal_eval(stash_codes)))

253 

254 

255def _check_input_files(input_path: Path | str, filename_pattern: str) -> Iterable[Path]: 

256 """Get an iterable of files to load, and check that they all exist. 

257 

258 Arguments 

259 --------- 

260 input_path: Path | str 

261 Path to an input file or directory. The path may itself contain glob 

262 patterns, but unlike in shells it will match directly first. 

263 

264 filename_pattern: str 

265 Shell-style glob pattern to match inside the input directory. 

266 

267 Returns 

268 ------- 

269 Iterable[Path] 

270 A list of files to load. 

271 

272 Raises 

273 ------ 

274 FileNotFoundError: 

275 If the provided arguments don't resolve to at least one existing file. 

276 """ 

277 # Convert string paths into Path objects. 

278 if isinstance(input_path, str): 

279 input_path = Path(input_path) 

280 

281 # Get the list of files in the directory, or use it directly. Error if no 

282 # files found. 

283 if input_path.is_dir(): 

284 files = tuple(sorted(input_path.glob(filename_pattern))) 

285 if len(files) == 0: 

286 raise FileNotFoundError( 

287 f"No files found matching filename_pattern {filename_pattern} within {input_path}" 

288 ) 

289 elif input_path.is_file(): 

290 files = (input_path,) 

291 else: 

292 # Handle input_path containing a glob pattern. 

293 files = tuple(sorted(input_path.parent.glob(input_path.name))) 

294 if len(files) == 0: 

295 raise FileNotFoundError(f"{input_path} does not exist!") 

296 logging.info("Loading files:\n%s", "\n".join(str(path) for path in files)) 

297 return files