Coverage for src/CSET/operators/read.py: 100%
79 statements
« prev ^ index » next coverage.py v7.5.4, created at 2024-07-01 10:31 +0000
1# Copyright 2022-2023 Met Office and contributors.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
15"""Operators for reading various types of files from disk."""
17import ast
18import logging
19import warnings
20from collections.abc import Iterable
21from pathlib import Path
23import iris
24import iris.coords
25import iris.cube
26import numpy as np
class NoDataWarning(UserWarning):
    """Warning that no data has been loaded.

    Issued by read_cubes when the constraints match no cubes; an empty
    CubeList is still returned to the caller.
    """
def read_cube(
    loadpath: Path,
    constraint: iris.Constraint = None,
    filename_pattern: str = "*",
    **kwargs,
) -> iris.cube.Cube:
    """Read a single cube from files.

    Read operator that takes a path string (can include wildcards), and uses
    iris to load the cube matching the constraint.

    If the loaded data is split across multiple files, a filename_pattern can be
    specified to select the read files using Unix shell-style wildcards. In this
    case the loadpath should point to the directory containing the data.

    Ensemble data can also be loaded. If it has a realization coordinate
    already, it will be directly used. If not, it will have its member number
    guessed from the filename, based on one of several common patterns. For
    example the pattern *emXX*, where XX is the realization.

    Deterministic data will be loaded with a realization of 0, allowing it to be
    processed in the same way as ensemble data.

    Arguments
    ---------
    loadpath: pathlike
        Path to where .pp/.nc files are located
    constraint: iris.Constraint | iris.ConstraintCombination, optional
        Constraints to filter data by. Defaults to unconstrained.
    filename_pattern: str, optional
        Unix shell-style glob pattern to match filenames to. Defaults to "*"

    Returns
    -------
    cubes: iris.cube.Cube
        Cube loaded

    Raises
    ------
    FileNotFoundError
        If the provided path does not exist
    ValueError
        If the constraint doesn't produce a single cube.
    """
    # Forward extra keyword arguments so read_cubes remains the single place
    # that interprets them. (Previously they were accepted but dropped.)
    cubes = read_cubes(loadpath, constraint, filename_pattern, **kwargs)
    # TODO: Fix coordinate name to enable full level and half level merging.
    # Check filtered cubes is a CubeList containing one cube.
    if len(cubes) == 1:
        return cubes[0]
    else:
        raise ValueError(
            f"Constraint doesn't produce single cube. {constraint}\n{cubes}"
        )
def read_cubes(
    loadpath: Path,
    constraint: iris.Constraint = None,
    filename_pattern: str = "*",
    **kwargs,
) -> iris.cube.CubeList:
    """Read cubes from files.

    Read operator that takes a path string (can include wildcards), and uses
    iris to load_cube all the cubes matching the constraint and return a
    CubeList object.

    If the loaded data is split across multiple files, a filename_pattern can be
    specified to select the read files using Unix shell-style wildcards. In this
    case the loadpath should point to the directory containing the data.

    Ensemble data can also be loaded. If it has a realization coordinate
    already, it will be directly used. If not, it will have its member number
    guessed from the filename, based on one of several common patterns. For
    example the pattern *emXX*, where XX is the realization.

    Deterministic data will be loaded with a realization of 0, allowing it to be
    processed in the same way as ensemble data.

    Data output by XIOS (such as LFRic) has its per-file metadata removed so
    that the cubes merge across files.

    Arguments
    ---------
    loadpath: pathlike
        Path to where .pp/.nc files are located
    constraint: iris.Constraint | iris.ConstraintCombination, optional
        Constraints to filter data by. Defaults to unconstrained.
    filename_pattern: str, optional
        Unix shell-style glob pattern to match filenames to. Defaults to "*"

    Returns
    -------
    cubes: iris.cube.CubeList
        Cubes loaded

    Raises
    ------
    FileNotFoundError
        If the provided path does not exist
    """
    input_files = _check_input_files(loadpath, filename_pattern)
    # A constraint of None lets everything be loaded.
    logging.debug("Constraint: %s", constraint)
    # Load the data, then reload with correct handling if it is ensemble data.
    callback = _create_callback(is_ensemble=False)
    cubes = iris.load(input_files, constraint, callback=callback)
    if _is_ensemble(cubes):
        callback = _create_callback(is_ensemble=True)
        cubes = iris.load(input_files, constraint, callback=callback)

    # Merge cubes now metadata has been fixed. CubeList.merge returns the
    # merged list rather than merging in place, so the result must be
    # reassigned — the previous bare `cubes.merge()` discarded it.
    cubes = cubes.merge()

    logging.debug("Loaded cubes: %s", cubes)
    if len(cubes) == 0:
        warnings.warn(
            "No cubes loaded, check your constraints!", NoDataWarning, stacklevel=2
        )
    return cubes
def _is_ensemble(cubelist: iris.cube.CubeList) -> bool:
    """Heuristically decide whether a CubeList holds ensemble data.

    Data is treated as ensemble when any cube carries a non-zero
    realization coordinate, or when two cubes share an identical structure
    (indicating multiple files covering the same time-step).
    """
    seen_structures = set()
    for member in cubelist:
        # A realization of 0 marks deterministic data, so only non-zero
        # values count as evidence of a real ensemble member.
        if member.coords("realization") and any(
            member.coord("realization").points != 0
        ):
            return True
        # Compare the XML serialisation of each cube's structure; a
        # duplicate means several files hold the same fields.
        structure = member.xml()
        if structure in seen_structures:
            logging.info("Ensemble data loaded.")
            logging.debug("Cube Contents: %s", seen_structures)
            return True
        seen_structures.add(structure)
    logging.info("Deterministic data loaded.")
    logging.debug("Cube Contents: %s", seen_structures)
    return False
def _create_callback(is_ensemble: bool) -> callable:
    """Compose together the needed callbacks into a single function."""
    # Pick the realization-handling step up front; LFRic attribute
    # normalisation always runs afterwards.
    realization_step = _ensemble_callback if is_ensemble else _deterministic_callback

    def callback(cube: iris.cube.Cube, field, filename: str):
        realization_step(cube, field, filename)
        _lfric_normalise_callback(cube, field, filename)

    return callback
192def _ensemble_callback(cube, field, filename):
193 """Add a realization coordinate to a cube.
195 Uses the filename to add an ensemble member ('realization') to each cube.
196 Assumes data is formatted enuk_um_0XX/enukaa_pd0HH.pp where XX is the
197 ensemble member.
199 Arguments
200 ---------
201 cube: Cube
202 ensemble member cube
203 field
204 Raw data variable, unused.
205 filename: str
206 filename of ensemble member data
207 """
208 if not cube.coords("realization"):
209 if "em" in filename:
210 # Assuming format is *emXX*
211 loc = filename.find("em") + 2
212 member = np.int32(filename[loc : loc + 2])
213 else:
214 # Assuming raw fields files format is enuk_um_0XX/enukaa_pd0HH
215 member = np.int32(filename[-15:-13])
217 cube.add_aux_coord(iris.coords.AuxCoord(member, standard_name="realization"))
220def _deterministic_callback(cube, field, filename):
221 """Give deterministic cubes a realization of 0.
223 This means they can be handled in the same way as ensembles through the rest
224 of the code.
225 """
226 # Only add if realization coordinate does not exist.
227 if not cube.coords("realization"):
228 cube.add_aux_coord(
229 iris.coords.AuxCoord(np.int32(0), standard_name="realization", units="1")
230 )
def _lfric_normalise_callback(cube: iris.cube.Cube, field, filename):
    """Normalise attributes that prevents LFRic cube from merging.

    The uuid and timeStamp relate to the output file, as saved by XIOS, and has
    no relation to the data contained. These attributes are removed.

    The um_stash_source is a list of STASH codes for when an LFRic field maps to
    multiple UM fields, however it can be encoded in any order. This attribute
    is sorted to prevent this. This attribute is only present in LFRic data that
    has been converted to look like UM data.
    """
    # Per-file XIOS metadata carries no information about the data itself,
    # so drop it to let cubes from different files merge.
    for transient_key in ("timeStamp", "uuid"):
        cube.attributes.pop(transient_key, None)

    # Canonicalise the STASH code list: parse the string as a Python list,
    # sort it, and re-encode, so encoding order cannot block merging.
    stash_list = cube.attributes.get("um_stash_source")
    if stash_list:
        cube.attributes["um_stash_source"] = str(sorted(ast.literal_eval(stash_list)))
255def _check_input_files(input_path: Path | str, filename_pattern: str) -> Iterable[Path]:
256 """Get an iterable of files to load, and check that they all exist.
258 Arguments
259 ---------
260 input_path: Path | str
261 Path to an input file or directory. The path may itself contain glob
262 patterns, but unlike in shells it will match directly first.
264 filename_pattern: str
265 Shell-style glob pattern to match inside the input directory.
267 Returns
268 -------
269 Iterable[Path]
270 A list of files to load.
272 Raises
273 ------
274 FileNotFoundError:
275 If the provided arguments don't resolve to at least one existing file.
276 """
277 # Convert string paths into Path objects.
278 if isinstance(input_path, str):
279 input_path = Path(input_path)
281 # Get the list of files in the directory, or use it directly. Error if no
282 # files found.
283 if input_path.is_dir():
284 files = tuple(sorted(input_path.glob(filename_pattern)))
285 if len(files) == 0:
286 raise FileNotFoundError(
287 f"No files found matching filename_pattern {filename_pattern} within {input_path}"
288 )
289 elif input_path.is_file():
290 files = (input_path,)
291 else:
292 # Handle input_path containing a glob pattern.
293 files = tuple(sorted(input_path.parent.glob(input_path.name)))
294 if len(files) == 0:
295 raise FileNotFoundError(f"{input_path} does not exist!")
296 logging.info("Loading files:\n%s", "\n".join(str(path) for path in files))
297 return files