Coverage for src/CSET/operators/read.py: 100%
79 statements
« prev ^ index » next coverage.py v7.5.4, created at 2024-07-01 08:37 +0000
« prev ^ index » next coverage.py v7.5.4, created at 2024-07-01 08:37 +0000
1# Copyright 2022-2023 Met Office and contributors.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
15"""Operators for reading various types of files from disk."""
17import ast
18import logging
19import warnings
20from collections.abc import Iterable
21from pathlib import Path
23import iris
24import iris.coords
25import iris.cube
26import numpy as np
29class NoDataWarning(UserWarning):
30 """Warning that no data has been loaded."""
33def read_cube(
34 loadpath: Path,
35 constraint: iris.Constraint = None,
36 filename_pattern: str = "*",
37 **kwargs,
38) -> iris.cube.Cube:
39 """Read a single cube from files.
41 Read operator that takes a path string (can include wildcards), and uses
42 iris to load the cube matching the constraint.
44 If the loaded data is split across multiple files, a filename_pattern can be
45 specified to select the read files using Unix shell-style wildcards. In this
46 case the loadpath should point to the directory containing the data.
48 Ensemble data can also be loaded. If it has a realization coordinate
49 already, it will be directly used. If not, it will have its member number
50 guessed from the filename, based on one of several common patterns. For
51 example the pattern *emXX*, where XX is the realization.
53 Deterministic data will be loaded with a realization of 0, allowing it to be
54 processed in the same way as ensemble data.
56 Arguments
57 ---------
58 loadpath: pathlike
59 Path to where .pp/.nc files are located
60 constraint: iris.Constraint | iris.ConstraintCombination, optional
61 Constraints to filter data by. Defaults to unconstrained.
62 filename_pattern: str, optional
63 Unix shell-style glob pattern to match filenames to. Defaults to "*"
65 Returns
66 -------
67 cubes: iris.cube.Cube
68 Cube loaded
70 Raises
71 ------
72 FileNotFoundError
73 If the provided path does not exist
74 ValueError
75 If the constraint doesn't produce a single cube.
76 """
77 cubes = read_cubes(loadpath, constraint, filename_pattern)
78 # Check filtered cubes is a CubeList containing one cube.
79 if len(cubes) == 1:
80 return cubes[0]
81 else:
82 raise ValueError(
83 f"Constraint doesn't produce single cube. {constraint}\n{cubes}"
84 )
87def read_cubes(
88 loadpath: Path,
89 constraint: iris.Constraint = None,
90 filename_pattern: str = "*",
91 **kwargs,
92) -> iris.cube.CubeList:
93 """Read cubes from files.
95 Read operator that takes a path string (can include wildcards), and uses
96 iris to load_cube all the cubes matching the constraint and return a
97 CubeList object.
99 If the loaded data is split across multiple files, a filename_pattern can be
100 specified to select the read files using Unix shell-style wildcards. In this
101 case the loadpath should point to the directory containing the data.
103 Ensemble data can also be loaded. If it has a realization coordinate
104 already, it will be directly used. If not, it will have its member number
105 guessed from the filename, based on one of several common patterns. For
106 example the pattern *emXX*, where XX is the realization.
108 Deterministic data will be loaded with a realization of 0, allowing it to be
109 processed in the same way as ensemble data.
111 Data output by XIOS (such as LFRic) has its per-file metadata removed so
112 that the cubes merge across files.
114 Arguments
115 ---------
116 loadpath: pathlike
117 Path to where .pp/.nc files are located
118 constraint: iris.Constraint | iris.ConstraintCombination, optional
119 Constraints to filter data by. Defaults to unconstrained.
120 filename_pattern: str, optional
121 Unix shell-style glob pattern to match filenames to. Defaults to "*"
123 Returns
124 -------
125 cubes: iris.cube.CubeList
126 Cubes loaded
128 Raises
129 ------
130 FileNotFoundError
131 If the provided path does not exist
132 """
133 input_files = _check_input_files(loadpath, filename_pattern)
134 # A constraint of None lets everything be loaded.
135 logging.debug("Constraint: %s", constraint)
136 # Load the data, then reload with correct handling if it is ensemble data.
137 callback = _create_callback(is_ensemble=False)
138 cubes = iris.load(input_files, constraint, callback=callback)
139 if _is_ensemble(cubes):
140 callback = _create_callback(is_ensemble=True)
141 cubes = iris.load(input_files, constraint, callback=callback)
143 # Merge cubes now metadata has been fixed.
144 cubes.merge()
146 logging.debug("Loaded cubes: %s", cubes)
147 if len(cubes) == 0:
148 warnings.warn(
149 "No cubes loaded, check your constraints!", NoDataWarning, stacklevel=2
150 )
151 return cubes
154def _is_ensemble(cubelist: iris.cube.CubeList) -> bool:
155 """Test if a CubeList is likely to be ensemble data.
157 If cubes either have a realization dimension, or there are multiple files
158 for the same time-step, we can assume it is ensemble data.
159 """
160 unique_cubes = set()
161 for cube in cubelist:
162 # Ignore realization of 0, as that is given to deterministic data.
163 if cube.coords("realization") and any(cube.coord("realization").points != 0):
164 return True
165 # Compare XML representation of cube structure check for duplicates.
166 cube_content = cube.xml()
167 if cube_content in unique_cubes:
168 logging.info("Ensemble data loaded.")
169 logging.debug("Cube Contents: %s", unique_cubes)
170 return True
171 else:
172 unique_cubes.add(cube_content)
173 logging.info("Deterministic data loaded.")
174 logging.debug("Cube Contents: %s", unique_cubes)
175 return False
178def _create_callback(is_ensemble: bool) -> callable:
179 """Compose together the needed callbacks into a single function."""
181 def callback(cube: iris.cube.Cube, field, filename: str):
182 if is_ensemble:
183 _ensemble_callback(cube, field, filename)
184 else:
185 _deterministic_callback(cube, field, filename)
186 _lfric_normalise_callback(cube, field, filename)
188 return callback
191def _ensemble_callback(cube, field, filename):
192 """Add a realization coordinate to a cube.
194 Uses the filename to add an ensemble member ('realization') to each cube.
195 Assumes data is formatted enuk_um_0XX/enukaa_pd0HH.pp where XX is the
196 ensemble member.
198 Arguments
199 ---------
200 cube: Cube
201 ensemble member cube
202 field
203 Raw data variable, unused.
204 filename: str
205 filename of ensemble member data
206 """
207 if not cube.coords("realization"):
208 if "em" in filename:
209 # Assuming format is *emXX*
210 loc = filename.find("em") + 2
211 member = np.int32(filename[loc : loc + 2])
212 else:
213 # Assuming raw fields files format is enuk_um_0XX/enukaa_pd0HH
214 member = np.int32(filename[-15:-13])
216 cube.add_aux_coord(iris.coords.AuxCoord(member, standard_name="realization"))
219def _deterministic_callback(cube, field, filename):
220 """Give deterministic cubes a realization of 0.
222 This means they can be handled in the same way as ensembles through the rest
223 of the code.
224 """
225 # Only add if realization coordinate does not exist.
226 if not cube.coords("realization"):
227 cube.add_aux_coord(
228 iris.coords.AuxCoord(np.int32(0), standard_name="realization", units="1")
229 )
232def _lfric_normalise_callback(cube: iris.cube.Cube, field, filename):
233 """Normalise attributes that prevents LFRic cube from merging.
235 The uuid and timeStamp relate to the output file, as saved by XIOS, and has
236 no relation to the data contained. These attributes are removed.
238 The um_stash_source is a list of STASH codes for when an LFRic field maps to
239 multiple UM fields, however it can be encoded in any order. This attribute
240 is sorted to prevent this. This attribute is only present in LFRic data that
241 has been converted to look like UM data.
242 """
243 # Remove unwanted attributes.
244 cube.attributes.pop("timeStamp", None)
245 cube.attributes.pop("uuid", None)
247 # Sort STASH code list.
248 stash_list = cube.attributes.get("um_stash_source")
249 if stash_list:
250 # Parse the string as a list, sort, then re-encode as a string.
251 cube.attributes["um_stash_source"] = str(sorted(ast.literal_eval(stash_list)))
254def _check_input_files(input_path: Path | str, filename_pattern: str) -> Iterable[Path]:
255 """Get an iterable of files to load, and check that they all exist.
257 Arguments
258 ---------
259 input_path: Path | str
260 Path to an input file or directory. The path may itself contain glob
261 patterns, but unlike in shells it will match directly first.
263 filename_pattern: str
264 Shell-style glob pattern to match inside the input directory.
266 Returns
267 -------
268 Iterable[Path]
269 A list of files to load.
271 Raises
272 ------
273 FileNotFoundError:
274 If the provided arguments don't resolve to at least one existing file.
275 """
276 # Convert string paths into Path objects.
277 if isinstance(input_path, str):
278 input_path = Path(input_path)
280 # Get the list of files in the directory, or use it directly. Error if no
281 # files found.
282 if input_path.is_dir():
283 files = tuple(sorted(input_path.glob(filename_pattern)))
284 if len(files) == 0:
285 raise FileNotFoundError(
286 f"No files found matching filename_pattern {filename_pattern} within {input_path}"
287 )
288 elif input_path.is_file():
289 files = (input_path,)
290 else:
291 # Handle input_path containing a glob pattern.
292 files = tuple(sorted(input_path.parent.glob(input_path.name)))
293 if len(files) == 0:
294 raise FileNotFoundError(f"{input_path} does not exist!")
295 logging.info("Loading files:\n%s", "\n".join(str(path) for path in files))
296 return files