Source code for resistics_readers.lemi.b423e

"""
Module for Lemi B423E data
Lemi B423E always records channels in the following order

- E1, E2, E3, E4

The Lemi B423E binary files are constructed from a 1024 text header followed
by repeating records with the following definitions (PPS is short for
deviation from PPS and PLL is short for PLL accuracy):

SECOND_TIMESTAMP, SAMPLE_NUM [0-FS], E1, E2, E3, E4, PPS, PLL

These are interpreted to have byte types

L, H, l, l, l, l, h, h

The channels labels are not that useful for magnetotelluric projects. When
creating metadata it is best to map them to standard channels such as Ex or Ey
"""
from loguru import logger
from typing import List, Optional, Tuple
from pathlib import Path
import numpy as np
import pandas as pd
from resistics.time import TimeMetadata, TimeData

from resistics_readers.multifile import validate_consistency, validate_continuous
from resistics_readers.multifile import TimeMetadataMerge
from resistics_readers.lemi.b423 import TimeMetadataB423, TimeReaderB423

B423E_CHANS = ["E1", "E2", "E3", "E4"]
B423E_RECORD_BYTES = 26
B423E_HEADER_LENGTH = 1024
B423E_MULT = ["Ke1", "Ke2", "Ke3", "Ke4"]
B423E_ADD = ["Ae1", "Ae2", "Ae3", "Ae4"]


[docs]def make_sudbir_B423E_metadata(
    dir_path: Path,
    fs: float,
    chans: Optional[List[str]] = None,
    dx: float = 1,
    dy: float = 1,
    folders: Optional[List[str]] = None,
) -> None:
    """
    Generate metadata for every subdirectory in a folder

    Parameters
    ----------
    dir_path : Path
        Root directory
    fs : float
        Sampling frequency, Hz
    chans : Optional[List[str]], optional
        The channels, by default None
    dx : float, optional
        Dipole distance Ex, by default 1
    dy : float, optional
        Dipole distance Ey, by default 1
    folders : Optional[List[str]], optional
        An optional list of subfolders, by default None. If None, all the
        subfolders will be processed
    """
    from resistics.common import dir_subdirs

    if chans is None:
        chans = B423E_CHANS

    if folders is None:
        folders = dir_subdirs(dir_path)
    else:
        folders = [dir_path / folder for folder in folders]
    for folder in folders:
        make_B423E_metadata(folder, fs, chans=chans, dx=dx, dy=dy)


[docs]def make_B423E_metadata(
    dir_path: Path,
    fs: float,
    chans: Optional[List[str]] = None,
    dx: float = 1,
    dy: float = 1,
) -> None:
    """
    Read a single B423E measurement directory, construct and write out metadata

    Parameters
    ----------
    dir_path : Path
        Directory path
    fs : float
        The sampling frequency, Hz
    chans : Optional[List[str]], optional
        Optional list of chans, by default None
    dx : float, optional
        Dipole distance Ex, by default 1
    dy : float, optional
        Dipole distance Ey, by default 1
    """
    metadata_list = _get_B423E_metadata_list(dir_path, fs, chans)
    validate_consistency(dir_path, metadata_list)
    validate_continuous(dir_path, metadata_list)
    metadata = _merge_metadata(metadata_list, dx, dy)
    logger.info(f"Writing metadata in {dir_path}")
    metadata.write(dir_path / "metadata.json")


def _get_B423E_metadata_list(
    dir_path: Path, fs: float, chans: Optional[List[str]] = None
) -> List[TimeMetadataB423]:
    """
    Get list of TimeMetadataB423, one for each data file

    Parameters
    ----------
    dir_path : Path
        The data path
    fs : float
        The sampling frequency, Hz
    chans : Optional[List[str]]
        The channels, by default None. Standard channels are E1, E2, E3 and E4,
        but these have little meaning geophysically, so it is possible to
        set different labels for the four channels

    Returns
    -------
    List[TimeMetadataB423]
        List of TimeMetadata
    """
    from resistics_readers.lemi.b423 import _read_B423_headers

    if chans is None:
        chans = B423E_CHANS
    data_paths = list(dir_path.glob("*.B423"))
    metadata_list = []
    for data_path in data_paths:
        logger.debug(f"Reading data file {data_path}")
        metadata_B423E = _read_B423_headers(
            data_path,
            fs,
            data_byte_offset=B423E_HEADER_LENGTH,
            record_bytes=B423E_RECORD_BYTES,
            chans=chans,
        )
        metadata_list.append(metadata_B423E)
    return metadata_list


def _merge_metadata(
    metadata_list: List[TimeMetadataB423], dx: float = 1, dy: float = 1
) -> TimeMetadata:
    """Merge the metadata list into a TimeMetadata"""
    metadata = TimeMetadata(**metadata_list[0].dict())
    metadata.first_time = min([x.first_time for x in metadata_list])
    metadata.last_time = max([x.last_time for x in metadata_list])
    metadata.n_samples = np.sum([x.n_samples for x in metadata_list])

    # channel headers
    data_files = [x.data_file for x in metadata_list]
    for chan in metadata.chans:
        metadata.chans_metadata[chan].data_files = data_files
        if chan == "Ex":
            metadata.chans_metadata[chan].dipole_dist = dx
        if chan == "Ey":
            metadata.chans_metadata[chan].dipole_dist = dy
    return metadata


[docs]class TimeReaderB423E(TimeReaderB423):
    """
    Data reader for Lemi B423E data

    There is no separate metadata file for Lemi B423E data detailing the
    sampling frequency, the number of samples, the sensors etc.. Such a
    metadata file is a pre-requisite for resistics. There are helper methods to
    make one.

    In situations where a Lemi B423E dataset is recorded in multiple files, it
    is required that the recording is continuous.

    Other important notes about Lemi B423E files

    - 1024 bytes of ASCII metadata in the data file with scaling information
    - Lemi B423E raw measurement data is signed long integer format

    Important points about scalings

    - Raw data is integer counts for electric channels
    - Scalings in B423E files convert electric channels to uV (microvolt)

    If apply_scaling is False, data will be returned in:

    - microvolts for the electric channels

    Which is equivalent to applying the scalings in the B423E headers

    With apply_scaling True, the following additional scaling will be applied:

    - Electric channels converted to mV
    - Dipole length corrections are applied to electric channels

    .. note::

        For more information about Lemi B423 format, please see:
        http://lemisensors.com/?p=485
    """

    record_bytes: int = B423E_RECORD_BYTES
    limit_chans: Optional[List[str]] = None

[docs]    def read_metadata(self, dir_path: Path) -> TimeMetadataMerge:
        """
        Read metadata

        Parameters
        ----------
        dir_path : Path
            The data directory

        Returns
        -------
        TimeMetadataMerge
            TimeMetadata with a data table

        Raises
        ------
        MetadataReadError
            If the number of channels is incorrect
        TimeDataReadError
            If not all data files exist
        TimeDataReadError
            If extensions do not match
        """
        from resistics.errors import MetadataReadError, TimeDataReadError
        from resistics_readers.multifile import validate_consistency
        from resistics_readers.multifile import validate_continuous

        metadata = TimeMetadata.parse_file(dir_path / "metadata.json")
        if metadata.n_chans != len(B423E_CHANS):
            raise MetadataReadError(
                dir_path, f"Number channels {metadata.chans} != {len(B423E_CHANS)}"
            )
        metadata_list = _get_B423E_metadata_list(dir_path, metadata.fs)
        validate_consistency(dir_path, metadata_list)
        validate_continuous(dir_path, metadata_list)
        data_table = self._generate_table(metadata_list)
        metadata_dict = metadata.dict()
        metadata_dict["data_table"] = data_table.to_dict()
        metadata = TimeMetadataMerge(**metadata_dict)

        if not self._check_data_files(dir_path, metadata):
            raise TimeDataReadError(dir_path, "All data files do not exist")
        if not self._check_extensions(dir_path, metadata):
            raise TimeDataReadError(dir_path, f"Data file suffix not {self.extension}")
        return metadata

[docs]    def read_data(
        self, dir_path: Path, metadata: TimeMetadata, read_from: int, read_to: int
    ) -> TimeData:
        """
        Get data from data files

        Lemi B423E data always has four channels, in order E1, E2, E3, E4. When
        writing out the metadata, it is possible to relabel these channels, for
        example:

        - Ex, Ey, E3, E4

        The raw data is integer counts. However, additional scalings from the
        B423 files are applied to give:

        - microvolts for the electric channels
        - millivolts for the magnetic with the gain applied

        The scalings are as follows:

        - Channel 1 = (Channel 1 * Ke1) + Ae1
        - Channel 2 = (Channel 1 * Ke2) + Ae2
        - Channel 3 = (Channel 1 * Ke3) + Ae3
        - Channel 4 = (Channel 1 * Ke4) + Ae4

        Unlike most other readers, the channels returned can be explicity
        selected by setting the limit_chans attribute of the class. This is
        because in many cases, only two of the B423Echannels are actually
        useful.

        Parameters
        ----------
        dir_path : path
            The directory path to read from
        metadata : TimeMetadata
            Time series data metadata
        read_from : int
            Sample to read data from
        read_to : int
            Sample to read data to

        Returns
        -------
        TimeData
            Time data object
        """
        from resistics_readers.multifile import samples_to_sources

        dtype = np.float32
        n_samples = read_to - read_from + 1
        chans, chan_indices = self._get_chans(dir_path, metadata)
        n_chans = len(chans)
        logger.info(f"Reading channels {chans}, indices {chan_indices}")

        messages = [f"Reading raw data from {dir_path}"]
        messages.append(f"Sampling rate {metadata.fs} Hz")
        # loop over B423 files and read data
        data_table = pd.DataFrame(data=metadata.data_table)
        df_to_read = samples_to_sources(dir_path, data_table, read_from, read_to)
        data = np.empty(shape=(n_chans, n_samples), dtype=dtype)
        sample = 0
        for data_file, info in df_to_read.iterrows():
            file_from = info.loc["read_from"]
            file_to = info.loc["read_to"]
            n_samples_file = info.loc["n_samples_read"]
            data_byte_start = info.loc["data_byte_start"]
            mult = np.array([info.loc[B423E_MULT[idx]] for idx in chan_indices])
            add = np.array([info.loc[B423E_ADD[idx]] for idx in chan_indices])

            messages.append(f"{data_file}: Reading samples {file_from} to {file_to}")
            logger.debug(f"{data_file}: Reading samples {file_from} to {file_to}")
            byte_read_start = data_byte_start + file_from * self.record_bytes
            n_bytes_to_read = n_samples_file * self.record_bytes
            with (dir_path / data_file).open("rb") as f:
                f.seek(byte_read_start, 0)
                data_bytes = f.read(n_bytes_to_read)

            data_read = self._parse_records(metadata.chans, data_bytes)
            data_read = (data_read[chan_indices] * mult[:, None]) + add[:, None]
            data[:, sample : sample + n_samples_file] = data_read
            sample = sample + n_samples_file
        metadata = self._adjust_metadata_chans(metadata, chans)
        metadata = self._get_return_metadata(metadata, read_from, read_to)
        messages.append(f"From sample, time: {read_from}, {str(metadata.first_time)}")
        messages.append(f"To sample, time: {read_to}, {str(metadata.last_time)}")
        metadata.history.add_record(self._get_record(messages))
        logger.info(f"Data successfully read from {dir_path}")
        return TimeData(metadata, data)

    def _get_chans(
        self, dir_path: Path, metadata: TimeMetadataMerge
    ) -> Tuple[List[str], List[int]]:
        """
        Get the channels to read and their indices

        This is mainly used because it is likely that not all the channels are
        used or recording anything worthwhile

        Parameters
        ----------
        dir_path : Path
            The directory path
        metadata : TimeMetadataMerge
            The metadata for the recording

        Returns
        -------
        Tuple[List[str], List[int]]
            The channels and the channel indices

        Raises
        ------
        TimeDataReadError
            If any of limit_chans are not in the metadata.chans
        """
        from resistics.errors import TimeDataReadError

        if self.limit_chans is None:
            return metadata.chans, list(range(metadata.n_chans))
        chk = set(self.limit_chans) - set(metadata.chans)
        if len(chk) > 0:
            raise TimeDataReadError(
                dir_path, f"Chans {chk} not in metadata {metadata.chans}"
            )
        indices = [metadata.chans.index(x) for x in self.limit_chans]
        return self.limit_chans, indices

    def _adjust_metadata_chans(
        self, metadata: TimeMetadataMerge, chans: List[str]
    ) -> TimeMetadataMerge:
        """Adjust the channels in the metadata"""
        metadata.chans_metadata = {c: metadata.chans_metadata[c] for c in chans}
        metadata.chans = chans
        metadata.n_chans = len(chans)
        return metadata

[docs]    def scale_data(self, time_data: TimeData) -> TimeData:
        """
        Get data scaled to physical values

        resistics uses field units, meaning physical samples will return the
        following:

        - Electrical channels in mV/km
        - Magnetic channels in mV
        - To get magnetic fields in nT, calibration needs to be performed

        Notes
        -----
        When Lemi data is read in, scaling in the headers is applied. Therefore,
        the magnetic channels is in mV with gain applied and the electric
        channels are in uV (microvolts). To complete the scaling to field units,
        the below additional corrections need to be applied.

        Electric channels need to divided by 1000 along with dipole length
        division in km (east-west spacing and north-south spacing) to return
        mV/km.

        Magnetic channels need to be divided by the internal gain value which
        should be set in the metadata

        Parameters
        ----------
        time_data : TimeData
            Input time data

        Returns
        -------
        TimeData
            Time data in field units
        """
        logger.info("Applying scaling to data to give field units")
        messages = ["Scaling raw data to physical units"]
        for chan in time_data.metadata.chans:
            chan_metadata = time_data.metadata.chans_metadata[chan]
            time_data[chan] = time_data[chan] / 1000.0
            messages.append(f"Dividing chan {chan} by 1000 to convert from uV to mV")
            dipole_dist_km = chan_metadata.dipole_dist / 1_000
            time_data[chan] = time_data[chan] / dipole_dist_km
            messages.append(f"Dividing {chan} by dipole length {dipole_dist_km} km")
        record = self._get_record(messages)
        time_data.metadata.history.add_record(record)
        return time_data