"""Provides the core IO facilities supported by PyARPES.
The most important here are the data loading functions (load_data, load_example_data).
and pickling utilities.
Heavy lifting is actually performed by the plugin definitions which know how to ingest
different data formats into the PyARPES data model.
TODO: An improvement could be made to the example data if served
over a network and someone was willing to host a few larger pieces
of data.
"""
from __future__ import annotations
import copy
import pickle
import warnings
from collections.abc import Iterable
from dataclasses import dataclass
from logging import DEBUG, INFO
from pathlib import Path
from typing import TYPE_CHECKING, Literal, cast
import numpy as np
import pandas as pd
import xarray as xr
from .configuration.interface import get_workspace_path
from .debug import setup_logger
from .endstations.registry import resolve_endstation
from .example_data.mock import build_mock_tarpes
from .provenance import provenance_multiple_parents
if TYPE_CHECKING:
from _typeshed import Incomplete
from arpes._typing.attrs_property import ScanDesc
from arpes._typing.base import XrTypes
__all__ = ("easy_pickle", "list_pickles", "load_data", "load_example_data", "load_scan", "stitch")
LOGLEVELS = (DEBUG, INFO)
LOGLEVEL = LOGLEVELS[1]
logger = setup_logger(__name__, LOGLEVEL)
[docs]
def load_data(
file: str | Path,
location: str | None = None,
**kwargs: Incomplete,
) -> xr.Dataset:
"""Loads a piece of data using available plugins. This the user facing API for data loading.
Args:
file (str | Path): An identifier for the file which should be loaded, i.e., the file path.
location (str | type[EndstationBase]): The name of the endstation/plugin to use.
You should try to provide one. If None is provided, the loader
will try to find an appropriate one based on the file extension and brute force.
This will be slower and can be error prone in certain circumstances.
kwargs: pass to load_scan
Optionally, you can pass a loading plugin (the class) through this kwarg and directly
specify the class to be used.
Returns:
The loaded data. Ideally, data which is loaded through the plugin system should be highly
compliant with the PyARPES data model and should work seamlessly with PyARPES analysis code.
"""
try:
file = int(str(file)) # type: ignore[assignment] # pragma: no cover
warnings.warn(
"This functionality, the data specified by number, will be removed.",
DeprecationWarning,
stacklevel=2,
)
except ValueError:
assert isinstance(file, (str | Path))
file = str(Path(file).absolute())
desc: ScanDesc = {
"file": file, # type:ignore[typeddict-item]
"location": location, # type:ignore[typeddict-item]
}
if location is None:
desc.pop("location")
warnings.warn(
(
"You should provide a location indicating the endstation or "
"instrument used directly en loading data without a dataset."
"We are going to do our best but no guarantees."
),
stacklevel=2,
)
logger.debug(f"contents of desc: {desc}")
return load_scan(desc, **kwargs)
DATA_EXAMPLES: dict[str, tuple[str, str]] = {
"cut": ("ALG-MC", "cut.fits"),
"map": ("example_data", "fermi_surface.nc"),
"photon_energy": ("example_data", "photon_energy.nc"),
"nano_xps": ("example_data", "nano_xps.nc"),
"temperature_dependence": ("example_data", "temperature_dependence.nc"),
"cut2": ("SPD", "example_itx_data.itx"),
"cut3": ("DSNP_UMCS", "BLGr_K_cut.xy"),
"map2": ("DSNP_UMCS", "BLGr_GK_map.xy"),
}
[docs]
def load_example_data(example_name: str = "cut") -> xr.Dataset:
"""Provides sample data for executable documentation.
Args:
example_name: (cut, cut2, cut3, map, map2, photon_energy, nano_xps, temperature_dependence)
Returns:
example DataSet
"""
if example_name not in DATA_EXAMPLES:
msg = f"Could not find requested example_name: {example_name}."
msg += f"Please provide one of {list(DATA_EXAMPLES.keys())}"
raise KeyError(msg)
location, example = DATA_EXAMPLES[example_name]
logger.debug(f"location:{location}")
file = Path(__file__).parent / "example_data" / example
return load_data(file=file, location=location)
@dataclass
class ExampleData:
@property
def cut(self) -> xr.Dataset:
return load_example_data("cut")
@property
def map(self) -> xr.Dataset:
return load_example_data("map")
@property
def photon_energy(self) -> xr.Dataset:
return load_example_data("photon_energy")
@property
def nano_xps(self) -> xr.Dataset:
return load_example_data("nano_xps")
@property
def temperature_dependence(self) -> xr.Dataset:
return load_example_data("temperature_dependence")
@property
def cut2(self) -> xr.Dataset:
return load_example_data("cut2")
@property
def cut3(self) -> xr.Dataset:
return load_example_data("cut3")
@property
def map2(self) -> xr.Dataset:
return load_example_data("map2")
@property
def t_arpes(self) -> list[xr.DataArray]:
return build_mock_tarpes()
example_data = ExampleData()
def stitch(
df_or_list: list[str] | pd.DataFrame,
attr_or_axis: str | list[float] | tuple[float, ...],
built_axis_name: str = "",
*,
sort: bool = True,
) -> XrTypes:
"""Stitches together a sequence of scans or a DataFrame.
Args:
df_or_list(list[str] | pd.DataFrame): The list of the files to load
attr_or_axis(str|list[float]|tuple[float, ...]): Coordinate or attribute in order to
promote to an index. I.e. if 't_a' is specified, we will create a new axis
corresponding to the temperature and concatenate the data along this axis
built_axis_name: The name of the concatenated output dimensions
sort: Whether to sort inputs to the concatenation according to their `attr_or_axis` value.
Returns:
The concatenated data.
"""
list_of_files = _df_or_list_to_files(df_or_list)
if not built_axis_name:
assert isinstance(attr_or_axis, str)
built_axis_name = attr_or_axis
if not list_of_files:
msg = "Must supply at least one file to stitch"
raise ValueError(msg)
loaded: list[xr.Dataset] = []
i = 0
for f in list_of_files:
data: xr.Dataset = load_data(f)
value: xr.DataArray | float | None = None
if isinstance(attr_or_axis, list | tuple):
value = attr_or_axis[i]
elif attr_or_axis in data.attrs:
value = data.attrs[attr_or_axis]
elif attr_or_axis in data.coords:
value = data.coords[attr_or_axis]
loaded.append(data.assign_coords({built_axis_name: value}))
assert all(isinstance(data, xr.DataArray) for data in loaded) or all(
isinstance(data, xr.Dataset) for data in loaded
)
if sort:
loaded.sort(key=lambda x: np.min(x.coords[built_axis_name].values))
assert isinstance(loaded, Iterable)
concatenated = xr.concat(loaded, dim=built_axis_name)
if "id" in concatenated.attrs:
del concatenated.attrs["id"]
provenance_multiple_parents(
concatenated,
loaded,
{
"what": "Stitched together separate datasets",
"by": "stitch",
"dim": built_axis_name,
},
)
return concatenated
def _df_or_list_to_files(
df_or_list: list[str] | pd.DataFrame,
) -> list[str]:
"""Helper function for stitch.
Args:
df_or_list(pd.DataFrame, list): input data file
Returns: (list[str])
list of files to stitch.
"""
if isinstance(df_or_list, pd.DataFrame):
return list(df_or_list.index)
assert not isinstance(
df_or_list,
list | tuple,
), "Expected an iterable for a list of the scans to stitch together"
return list(df_or_list)
def file_for_pickle(name: str) -> Path | str:
here = Path()
if get_workspace_path():
here = Path(get_workspace_path())
path = here / "picklejar" / f"{name}.pickle"
path.parent.mkdir(exist_ok=True)
return str(path)
def load_pickle(name: str) -> object:
"""Loads a workspace local pickle. Inverse to `save_pickle`."""
with Path(file_for_pickle(name)).open("rb") as file:
return pickle.load(file) # noqa: S301
def save_pickle(data: object, name: str) -> None:
"""Saves a workspace local pickle. Inverse to `load_pickle`."""
with Path(file_for_pickle(name)).open("wb") as pickle_file:
pickle.dump(data, pickle_file)
def easy_pickle(data_or_str: str | object, name: str = "") -> object:
"""A convenience function around pickling.
Provides a workspace scoped associative set of named pickles which
can be used for
Examples:
Retaining analysis results between sessions.
Sharing results between workspaces.
Caching expensive or interim work.
For reproducibility reasons, you should generally prefer to
duplicate anaysis results using common code to prevent stale data
dependencies, but there are good reasons to use pickling as well.
This function knows whether we are pickling or unpickling depending on
whether one or two arguments are provided.
Args:
data_or_str: If saving, the data to be pickled. If loading, the name of the pickle to load.
name: If saving (non-None value), the name to associate. Defaults to None.
Returns:
None if name is not None, which indicates that we are saving data.
Otherwise, returns the unpickled value associated to `name`.
"""
# we are loading data
if isinstance(data_or_str, str) or not name:
assert isinstance(data_or_str, str)
return load_pickle(data_or_str)
# we are saving data
assert isinstance(name, str)
save_pickle(data_or_str, name)
return None
def list_pickles() -> list[str]:
"""Generates a summary list of (workspace-local) pickled results and data.
Returns:
A list of the named pickles, suitable for passing to `easy_pickle`.
"""
return [str(s.stem) for s in Path(file_for_pickle("just-a-pickle")).parent.glob("*.pickle")]
def load_scan(
scan_desc: ScanDesc,
*,
retry: bool = True,
**kwargs: Incomplete,
) -> xr.Dataset:
"""Resolves a plugin and delegates loading a scan.
This is used internally by `load_data` and should not be invoked directly
by users.
Determines which data loading class is appropriate for the data,
shuffles a bit of metadata, and calls the .load function on the
retrieved class to start the data loading process.
Args:
scan_desc: Information identifying the scan, typically the full path.
retry: Used to attempt a reload of plugins and subsequent data load attempt.
kwargs: pass to the endstation.load(scan_dec, **kwargs)
Returns:
Loaded and normalized ARPES scan data.
"""
note: dict[str, str | float] | ScanDesc = scan_desc.get("note", scan_desc)
full_note: ScanDesc = copy.deepcopy(scan_desc)
assert isinstance(note, dict)
full_note.update(cast("ScanDesc", note))
endstation_cls = resolve_endstation(retry=retry, **full_note)
logger.debug(f"Using plugin class {endstation_cls}")
key: Literal["file", "path"] = "file" if "file" in scan_desc else "path"
file = scan_desc[key]
try:
file_number: int = int(str(file))
file = endstation_cls.find_first_file(file_number)
scan_desc[key] = file
except ValueError:
pass
logger.debug(f"Loading {scan_desc}")
endstation = endstation_cls()
return endstation.load(scan_desc, **kwargs)