Coverage for src/CSET/operators/read.py: 100%

79 statements  

« prev     ^ index     » next       coverage.py v7.5.4, created at 2024-07-01 08:37 +0000

1# Copyright 2022-2023 Met Office and contributors. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14 

15"""Operators for reading various types of files from disk.""" 

16 

17import ast 

18import logging 

19import warnings 

20from collections.abc import Iterable 

21from pathlib import Path 

22 

23import iris 

24import iris.coords 

25import iris.cube 

26import numpy as np 

27 

28 

class NoDataWarning(UserWarning):
    """Warning that no data has been loaded.

    Emitted by read_cubes when the given constraints filter out every cube.
    """

31 

32 

def read_cube(
    loadpath: Path,
    constraint: iris.Constraint = None,
    filename_pattern: str = "*",
    **kwargs,
) -> iris.cube.Cube:
    """Read a single cube from files.

    Loads data from the given path (which may contain wildcards) with iris,
    and returns the one cube matching the constraint.

    When the data is split across multiple files, filename_pattern selects
    which files are read using Unix shell-style wildcards; loadpath should
    then point at the directory containing the data.

    Ensemble data can also be loaded. A pre-existing realization coordinate
    is used directly; otherwise the member number is guessed from the
    filename, based on one of several common patterns, e.g. *emXX* where XX
    is the realization. Deterministic data is given a realization of 0 so it
    can be processed the same way as ensemble data.

    Arguments
    ---------
    loadpath: pathlike
        Path to where .pp/.nc files are located
    constraint: iris.Constraint | iris.ConstraintCombination, optional
        Constraints to filter data by. Defaults to unconstrained.
    filename_pattern: str, optional
        Unix shell-style glob pattern to match filenames to. Defaults to "*"
    **kwargs
        Additional keyword arguments; ignored.

    Returns
    -------
    cubes: iris.cube.Cube
        Cube loaded

    Raises
    ------
    FileNotFoundError
        If the provided path does not exist
    ValueError
        If the constraint doesn't produce a single cube.
    """
    cubes = read_cubes(loadpath, constraint, filename_pattern)
    # The constraint must select exactly one cube.
    if len(cubes) != 1:
        raise ValueError(
            f"Constraint doesn't produce single cube. {constraint}\n{cubes}"
        )
    return cubes[0]

85 

86 

def read_cubes(
    loadpath: Path,
    constraint: iris.Constraint = None,
    filename_pattern: str = "*",
    **kwargs,
) -> iris.cube.CubeList:
    """Read cubes from files.

    Read operator that takes a path string (can include wildcards), and uses
    iris to load_cube all the cubes matching the constraint and return a
    CubeList object.

    If the loaded data is split across multiple files, a filename_pattern can be
    specified to select the read files using Unix shell-style wildcards. In this
    case the loadpath should point to the directory containing the data.

    Ensemble data can also be loaded. If it has a realization coordinate
    already, it will be directly used. If not, it will have its member number
    guessed from the filename, based on one of several common patterns. For
    example the pattern *emXX*, where XX is the realization.

    Deterministic data will be loaded with a realization of 0, allowing it to be
    processed in the same way as ensemble data.

    Data output by XIOS (such as LFRic) has its per-file metadata removed so
    that the cubes merge across files.

    Arguments
    ---------
    loadpath: pathlike
        Path to where .pp/.nc files are located
    constraint: iris.Constraint | iris.ConstraintCombination, optional
        Constraints to filter data by. Defaults to unconstrained.
    filename_pattern: str, optional
        Unix shell-style glob pattern to match filenames to. Defaults to "*"
    **kwargs
        Additional keyword arguments; ignored.

    Returns
    -------
    cubes: iris.cube.CubeList
        Cubes loaded

    Raises
    ------
    FileNotFoundError
        If the provided path does not exist
    """
    input_files = _check_input_files(loadpath, filename_pattern)
    # A constraint of None lets everything be loaded.
    logging.debug("Constraint: %s", constraint)
    # Load the data, then reload with correct handling if it is ensemble data.
    callback = _create_callback(is_ensemble=False)
    cubes = iris.load(input_files, constraint, callback=callback)
    if _is_ensemble(cubes):
        callback = _create_callback(is_ensemble=True)
        cubes = iris.load(input_files, constraint, callback=callback)

    # Merge cubes now metadata has been fixed. CubeList.merge returns a new
    # CubeList rather than merging in place, so the result must be rebound or
    # the merge is silently discarded.
    cubes = cubes.merge()

    logging.debug("Loaded cubes: %s", cubes)
    if len(cubes) == 0:
        warnings.warn(
            "No cubes loaded, check your constraints!", NoDataWarning, stacklevel=2
        )
    return cubes

152 

153 

def _is_ensemble(cubelist: iris.cube.CubeList) -> bool:
    """Test if a CubeList is likely to be ensemble data.

    If cubes either have a realization dimension, or there are multiple files
    for the same time-step, we can assume it is ensemble data.
    """
    seen_structures = set()
    for cube in cubelist:
        # A realization of 0 is given to deterministic data, so only non-zero
        # realizations indicate an ensemble.
        if cube.coords("realization") and any(cube.coord("realization").points != 0):
            return True
        # The XML representation captures the cube structure; a repeat means
        # duplicate time-steps across files, i.e. multiple members.
        structure = cube.xml()
        if structure in seen_structures:
            logging.info("Ensemble data loaded.")
            logging.debug("Cube Contents: %s", seen_structures)
            return True
        seen_structures.add(structure)
    logging.info("Deterministic data loaded.")
    logging.debug("Cube Contents: %s", seen_structures)
    return False

176 

177 

def _create_callback(is_ensemble: bool) -> callable:
    """Compose together the needed callbacks into a single function."""
    # Pick the realization handler once, rather than branching on every call.
    realization_callback = (
        _ensemble_callback if is_ensemble else _deterministic_callback
    )

    def callback(cube: iris.cube.Cube, field, filename: str):
        realization_callback(cube, field, filename)
        _lfric_normalise_callback(cube, field, filename)

    return callback

189 

190 

def _ensemble_callback(cube, field, filename):
    """Add a realization coordinate to a cube.

    Uses the filename to add an ensemble member ('realization') to each cube.
    Assumes data is formatted enuk_um_0XX/enukaa_pd0HH.pp where XX is the
    ensemble member.

    Arguments
    ---------
    cube: Cube
        ensemble member cube
    field
        Raw data variable, unused.
    filename: str
        filename of ensemble member data
    """
    # Leave cubes that already carry a realization coordinate untouched.
    if cube.coords("realization"):
        return
    if "em" in filename:
        # Assuming format is *emXX*; the member number follows "em".
        position = filename.find("em") + 2
        member = np.int32(filename[position : position + 2])
    else:
        # Assuming raw fields files format is enuk_um_0XX/enukaa_pd0HH, with
        # the member number at a fixed offset from the end.
        member = np.int32(filename[-15:-13])
    cube.add_aux_coord(iris.coords.AuxCoord(member, standard_name="realization"))

217 

218 

def _deterministic_callback(cube, field, filename):
    """Give deterministic cubes a realization of 0.

    This means they can be handled in the same way as ensembles through the rest
    of the code.
    """
    # A pre-existing realization coordinate is kept as-is.
    if cube.coords("realization"):
        return
    realization = iris.coords.AuxCoord(
        np.int32(0), standard_name="realization", units="1"
    )
    cube.add_aux_coord(realization)

230 

231 

def _lfric_normalise_callback(cube: iris.cube.Cube, field, filename):
    """Normalise attributes that prevents LFRic cube from merging.

    The uuid and timeStamp relate to the output file, as saved by XIOS, and has
    no relation to the data contained. These attributes are removed.

    The um_stash_source is a list of STASH codes for when an LFRic field maps to
    multiple UM fields, however it can be encoded in any order. This attribute
    is sorted to prevent this. This attribute is only present in LFRic data that
    has been converted to look like UM data.
    """
    # Strip per-file metadata that would block merging across files.
    for attribute in ("timeStamp", "uuid"):
        cube.attributes.pop(attribute, None)

    # Sort the STASH code list so otherwise-identical cubes compare equal.
    stash_list = cube.attributes.get("um_stash_source")
    if stash_list:
        # Parse the string as a list, sort, then re-encode as a string.
        cube.attributes["um_stash_source"] = str(sorted(ast.literal_eval(stash_list)))

252 

253 

254def _check_input_files(input_path: Path | str, filename_pattern: str) -> Iterable[Path]: 

255 """Get an iterable of files to load, and check that they all exist. 

256 

257 Arguments 

258 --------- 

259 input_path: Path | str 

260 Path to an input file or directory. The path may itself contain glob 

261 patterns, but unlike in shells it will match directly first. 

262 

263 filename_pattern: str 

264 Shell-style glob pattern to match inside the input directory. 

265 

266 Returns 

267 ------- 

268 Iterable[Path] 

269 A list of files to load. 

270 

271 Raises 

272 ------ 

273 FileNotFoundError: 

274 If the provided arguments don't resolve to at least one existing file. 

275 """ 

276 # Convert string paths into Path objects. 

277 if isinstance(input_path, str): 

278 input_path = Path(input_path) 

279 

280 # Get the list of files in the directory, or use it directly. Error if no 

281 # files found. 

282 if input_path.is_dir(): 

283 files = tuple(sorted(input_path.glob(filename_pattern))) 

284 if len(files) == 0: 

285 raise FileNotFoundError( 

286 f"No files found matching filename_pattern {filename_pattern} within {input_path}" 

287 ) 

288 elif input_path.is_file(): 

289 files = (input_path,) 

290 else: 

291 # Handle input_path containing a glob pattern. 

292 files = tuple(sorted(input_path.parent.glob(input_path.name))) 

293 if len(files) == 0: 

294 raise FileNotFoundError(f"{input_path} does not exist!") 

295 logging.info("Loading files:\n%s", "\n".join(str(path) for path in files)) 

296 return files