diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml
new file mode 100644
index 0000000..34be1c9
--- /dev/null
+++ b/.github/workflows/python-test.yml
@@ -0,0 +1,44 @@
+name: Python Test
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    if: env.SKIP_BUILD != 'true'
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+
+    strategy:
+      matrix:
+        python-version: ['3.9', '3.10', '3.11']
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v2
+
+      - name: Install Miniconda
+        run: |
+          wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+          bash miniconda.sh -b -p $HOME/miniconda
+          source "$HOME/miniconda/etc/profile.d/conda.sh"
+          conda config --set always_yes yes --set changeps1 no
+          conda update -q conda
+          conda info -a
+
+      - name: Install Mamba
+        run: |
+          source "$HOME/miniconda/etc/profile.d/conda.sh"
+          conda install mamba -c conda-forge
+
+      - name: Create Environment and Install Dependencies
+        run: |
+          source "$HOME/miniconda/etc/profile.d/conda.sh"
+          mamba create -q -n test-environment python=${{ matrix.python-version }}
+          conda activate test-environment
+          mamba install pytest -c conda-forge
+
+      - name: Run Tests
+        run: |
+          source "$HOME/miniconda/etc/profile.d/conda.sh"
+          conda activate test-environment
+          pytest
diff --git a/.gitignore b/.gitignore
index 2e1b830..91060e5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,7 @@ var/
 wheels/
 share/python-wheels/
 *.egg-info/
+notebooks/
 .installed.cfg
 *.egg
 MANIFEST
@@ -158,4 +159,12 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
+# Data files
+test_data/
+*.nc
+*.zarr
+*.raw
+# For MAC stuff
+.DS_Store
+oceanstream/.DS_Store
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e63b471..fb6fc0a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,8 +1,8 @@
 exclude: |
   (?x)^(
-    echopype/tests/|
-    echopype/test_data/|
-    echopype/visualize/|
+    tests/|
+    test_data/|
+    oceanstream/visualize/|
     docs/source/conf.py
   )
 repos:
@@ -37,6 +37,6 @@ repos:
     rev: v2.2.5
     hooks:
       - id: codespell
-        # Checks spelling in `docs/source` and `echopype` dirs ONLY
+        # Checks spelling in `docs/source` and `oceanstream` dirs ONLY
         # Ignores `.ipynb` files and `_build` folders
-        args: ["--skip=*.ipynb,docs/source/_build,echopype/test_data", "-w", "docs/source", "echopype"]
+        args: ["--skip=*.ipynb,docs/source/_build,oceanstream/test_data", "-w", "docs/source", "oceanstream"]
diff --git a/docs/source/L0.rst b/docs/source/L0.rst
index 4524fcc..a6cfcc4 100644
--- a/docs/source/L0.rst
+++ b/docs/source/L0.rst
@@ -1,5 +1,5 @@
 L0_unprocessed_data module
 ==========================
 
-.. automodule:: oceanstream.L0_unprocessed_data.template
+.. automodule:: oceanstream.L0_unprocessed_data.raw_reader
    :members:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 4d48ffb..6fed438 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -3,7 +3,7 @@ Oceanstream Documentation
 ===========================
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
    :caption: Contents:

    introduction
diff --git a/notebooks/README.md b/notebooks/README.md
new file mode 100644
index 0000000..2d8fa98
--- /dev/null
+++ b/notebooks/README.md
@@ -0,0 +1 @@
+# oceanstream
diff --git a/oceanstream/L0_unprocessed_data/raw_reader.py b/oceanstream/L0_unprocessed_data/raw_reader.py
new file mode 100644
index 0000000..1524be8
--- /dev/null
+++ b/oceanstream/L0_unprocessed_data/raw_reader.py
@@ -0,0 +1,443 @@
+"""
+raw_reader.py
+-------------
+Module for reading, verifying, and converting echo sounder raw data files.
+
+This module provides functionalities to:
+
+- Search for raw echo sounder files within specified directories or paths.
+- Verify the integrity of these files, ensuring they are readable by echopype.
+- Extract essential metadata from the files, such as campaign ID,\
+date of measurement, and sonar model.
+- Convert raw files to specified formats (netCDF or zarr) and\
+save them to a desired location.
+- Group similar files based on specific criteria, such as campaign ID,\
+sonar model, and time difference.
+
+The module supports various sonar models including
+EK60, ES70, EK80, EA640, AZFP, and AD2CP.
+It also defines a time threshold for determining
+the similarity between two consecutive files.
+
+"""
+
+# Import necessary libraries
+import os
+import re
+from datetime import datetime, timedelta
+from typing import Dict, List, Union
+
+import echopype as ep
+
+SUPPORTED_SONAR_MODELS = ["EK60", "ES70", "EK80", "EA640", "AZFP", "AD2CP"]
+TIME_BETWEEN_FILES = 30  # time in minutes between two consecutive files
+
+
+def file_finder(
+    paths: Union[str, List[str]], file_type: str = "raw"
+) -> List[str]:  # noqa: E501
+    """
+    Finds and returns all files of a specified type from given paths.
+
+    This function searches for files of a specified type (e.g., "raw")
+    within the provided paths.
+    It can search within a single directory or
+    across multiple specified file paths.
+
+    Parameters:
+
+    - paths (str or list[str]): If a string is provided,\
+    it should be the absolute path to a directory.\
+    If a list is provided,\
+    it should contain absolute paths to individual files.
+    - file_type (str, optional): The type of files to search for.\
+    Defaults to "raw".
+
+    Returns:
+
+    - list[str]: A sorted list of all found files of the specified type.
+
+    Raises:
+
+    - ValueError: If the provided paths input is neither a directory\
+    nor a list of file paths.
+
+    Example:
+
+    file_finder("/path/to/directory")
+    ['/path/to/directory/file1.raw', '/path/to/directory/file2.raw']
+
+    file_finder(["/path/to/file1.raw", "/path/to/file2.raw"])
+    ['/path/to/file1.raw', '/path/to/file2.raw']
+    """
+    if isinstance(paths, str) and os.path.isdir(paths):
+        ret_files = [
+            os.path.join(paths, f_p)
+            for f_p in os.listdir(paths)
+            if os.path.isfile(os.path.join(paths, f_p))
+        ]
+        ret_files = file_finder(ret_files, file_type)
+    elif isinstance(paths, list):
+        ret_files = []
+        for elem in paths:
+            if "." + file_type in elem and os.path.isfile(elem):
+                ret_files.append(elem)
+    else:
+        raise ValueError(
+            "Invalid input. Provide either a directory\
+            path or a list of file paths."
+        )
+
+    return sorted(ret_files)
+
+
+def file_integrity_checking(
+    file_path: str,
+) -> Dict[str, Union[str, datetime, bool]]:  # noqa: E501
+    """
+    Checks the integrity of a given echo sounder file.
+
+    This function verifies if the provided echo sounder file is
+    readable by echopype and extracts
+    essential metadata such as the campaign ID, date of measurement,
+    and sonar model. The function
+    supports raw, netCDF, and zarr file formats.
+
+    Parameters:
+
+    - file_path (str): Absolute path to the echo sounder file.
+
+    Returns:
+
+    - dict: A dictionary containing the following keys:
+        'file_path': Absolute path to the file.
+        'campaign_id': Identifier for the measuring\
+        campaign extracted from the file name.
+        'date': Date and time when the measurement started,\
+        extracted from the file name.
+        'sonar_model': Type of sonar that produced the file.
+        'file_integrity': Boolean indicating if the file is readable by echopype.
+
+    Raises:
+
+    - Exception: If the file type is not supported or
+    if there are issues reading the file.
+
+    Example:
+
+    file_integrity_checking("/path/to/JR161-D20230509-T100645.raw")
+    {
+        'file_path': '/path/to/JR161-D20230509-T100645.raw',
+        'campaign_id': 'JR161',
+        'date': datetime.datetime(2023, 5, 9, 10, 6, 45),
+        'sonar_model': 'EK60',
+        'file_integrity': True
+    }
+    """
+    return_dict = {}
+    # get file name from path
+    file_name = os.path.split(file_path)[-1]
+    # eliminate file type
+    file_name = file_name.split(".")[0]
+    campaign_id = file_name.split("-")[0]
+    no_date_from_file_name = False
+    date = datetime.now()
+    try:
+        pattern_date = r"D(\d{4})(\d{2})(\d{2})"
+        pattern_time = r"T(\d{2})(\d{2})(\d{2})"
+
+        matches_date = re.findall(pattern_date, file_name)[0]
+        matches_time = re.findall(pattern_time, file_name)[0]
+
+        year, month, day = matches_date
+        hour, minute, second = matches_time
+
+        datetime_string = f"D{year}{month}{day}-T{hour}{minute}{second}"
+        date = datetime.strptime(datetime_string, "D%Y%m%d-T%H%M%S")
+    except Exception:
+        # file name does not contain a parsable date; fall back to the
+        # creation date stored in the file's metadata below
+        no_date_from_file_name = True
+
+    if ".raw" in file_path:
+        for s_m in SUPPORTED_SONAR_MODELS:
+            try:
+                ed = ep.open_raw(file_path, sonar_model=s_m)  # type: ignore
+                file_integrity = True
+                break
+            except ValueError:
+                continue
+        else:
+            raise Exception("File type not supported for " + str(file_path))
+    elif ".nc" in file_path or ".zarr" in file_path:
+        try:
+            ed = ep.open_converted(file_path)
+            file_integrity = True
+        except ValueError:
+            raise Exception("File type not supported for " + str(file_path))
+    else:
+        raise Exception("File type not supported for " + str(file_path))
+    if no_date_from_file_name:
+        datetime_string = ed["Top-level"].date_created
+        date = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%SZ")
+
+    return_dict["file_path"] = file_path
+    return_dict["campaign_id"] = campaign_id
+    return_dict["date"] = date
+    return_dict["sonar_model"] = ed.sonar_model
+    return_dict["file_integrity"] = file_integrity
+    return return_dict
+
+
+def read_raw_files(
+    file_dicts: List[Dict[str, Union[str, datetime, bool]]]
+) -> List[ep.EchoData]:
+    """
+    Reads multiple raw echo sounder files and returns a list of Datasets.
+
+    This function processes a list of file information dictionaries,
+    opens each raw file
+    using the specified sonar model,
+    and returns the corresponding datasets.
+
+    Parameters:
+
+    - file_dicts (list of dict): List of dictionaries, \
+    each containing file information \
+    as provided by the file_integrity_checking function.
+
+    Returns:
+
+    - list: List of EchoData datasets corresponding to each raw file.
+
+    """
+    ret_list = []
+    for f_i in file_dicts:
+        opened_file = _read_file(f_i["file_path"], f_i["sonar_model"])
+        ret_list.append(opened_file)
+    return ret_list
+
+
+def read_processed_files(file_paths: List[str]) -> List[ep.EchoData]:
+    """
+    Reads multiple processed echo sounder files and returns a list of Datasets.
+
+    This function processes a list of file paths, opens each processed file,
+    and returns the corresponding datasets.
+
+    Parameters:
+
+    - file_paths (list of str): List of file paths\
+    to processed echo sounder files.
+
+    Returns:
+
+    - list: List of EchoData datasets\
+    corresponding to each processed file.
+
+    """
+    ret_list = []
+    for file_path in file_paths:
+        opened_file = _read_file(file_path)
+        ret_list.append(opened_file)
+    return ret_list
+
+
+def _read_file(file_path: str, sonar_model: str = "EK80") -> ep.EchoData:
+    """
+    Reads an echo sounder file and
+    returns the corresponding Dataset.
+
+    This function determines the type of the file
+    (raw, netCDF, or zarr) based on its
+    extension and opens it using echopype.
+    For raw files, the sonar model must be specified.
+
+    Parameters:
+
+    - file_path (str): Absolute path to the echo sounder file.
+    - sonar_model (str, optional): Type of sonar model. Defaults to "EK80".\
+    Relevant only for raw files.
+
+    Returns:
+
+    - EchoData: Dataset corresponding to the provided file.
+
+    Raises:
+
+    - Exception: If the file type is not supported by echopype.
+
+    """
+    file_name = os.path.split(file_path)[-1]
+    if ".raw" in file_name:
+        ed = ep.open_raw(file_path, sonar_model=sonar_model)  # type: ignore
+    elif ".nc" in file_name or ".zarr" in file_name:
+        ed = ep.open_converted(file_path)  # create an EchoData object
+    else:
+        raise Exception("File not supported by echopype.")
+    return ed
+
+
+def convert_raw_files(
+    file_dicts: List[Dict[str, Union[str, datetime, bool]]],
+    save_path: str = "",
+    save_file_type: str = "nc",
+) -> List[str]:
+    """
+    Converts multiple raw echo sounder files to the
+    specified file type and saves them.
+
+    This function processes a list of file information dictionaries,
+    converts each raw file
+    to the specified file type (netCDF or zarr),
+    and saves the converted files to the given path.
+
+    Parameters:
+
+    - file_dicts (list of dict): List of dictionaries,\
+    each containing file information.
+    - save_path (str): Directory path where\
+    the converted files will be saved.
+    - save_file_type (str): Desired file type\
+    for saving the converted files.\
+    Options are 'nc' or 'zarr'.
+
+    Returns:
+
+    - list: List of paths to the saved converted files.
+
+    """
+    ret_list = []
+    for f_i in file_dicts:
+        opened_file = _read_file(f_i["file_path"], f_i["sonar_model"])
+        _write_file(opened_file, save_path, save_file_type)
+        file_name = os.path.split(f_i["file_path"])[-1]
+        file_type = save_file_type
+        # swap only the extension, so a "raw" substring elsewhere in the
+        # file name is left untouched
+        new_file_name = os.path.splitext(file_name)[0] + "." + file_type
+        ret_list.append(os.path.join(save_path, new_file_name))
+    return ret_list
+
+
+def _write_file(
+    ed: ep.EchoData,
+    save_path: str,
+    save_file_type: str = "nc",
+    overwrite: bool = False,  # noqa: E501
+) -> str:
+    """
+    Writes an echo sounder dataset to a
+    specified file type and saves it.
+
+    This function takes an EchoData dataset,
+    converts it to the specified file type
+    (netCDF or zarr), and saves the file to the provided path.
+
+    Parameters:
+
+    - ed (EchoData): echo sounder dataset to be saved.
+    - save_path (str): Directory path where the dataset will be saved.
+    - save_file_type (str, optional): Desired file type\
+    for saving the dataset. Defaults to 'nc'.\
+    Options are 'nc' or 'zarr'.
+    - overwrite (bool, optional): If True, overwrites\
+    the file if it already exists. Defaults to False.
+
+    Returns:
+
+    - str: Path to the saved file.
+
+    Raises:
+
+    - Exception: If the specified file type is not supported by echopype.
+
+    """
+    if save_file_type == "nc":
+        ed.to_netcdf(save_path=save_path, overwrite=overwrite)
+    elif save_file_type == "zarr":
+        ed.to_zarr(save_path=save_path, overwrite=overwrite)
+    else:
+        raise Exception("File type not supported by echopype.")
+    return save_path
+
+
+def _is_similar(
+    file_dict1: Dict[str, Union[str, datetime, bool]],
+    file_dict2: Dict[str, Union[str, datetime, bool]],
+) -> bool:
+    """
+    Determines if two file information dictionaries
+    are similar based on specific criteria.
+
+    This function checks if two file dictionaries
+    have the same campaign ID, sonar model,
+    file integrity, and if their date difference
+    is within a specified time range.
+
+    Parameters:
+
+    - file_dict1 (dict): First file information dictionary.
+    - file_dict2 (dict): Second file information dictionary.
+
+    Returns:
+
+    - bool: True if the file dictionaries are similar\
+    based on the criteria, False otherwise.
+
+    """
+    if file_dict1["campaign_id"] != file_dict2["campaign_id"]:
+        return False
+    if file_dict1["sonar_model"] != file_dict2["sonar_model"]:
+        return False
+    if file_dict1["file_integrity"] != file_dict2["file_integrity"]:
+        return False
+    date_diff = file_dict1["date"] - file_dict2["date"]
+    if date_diff > timedelta(minutes=TIME_BETWEEN_FILES):
+        return False
+    return True
+
+
+def split_files(
+    file_dicts: List[Dict[str, Union[str, datetime, bool]]]
+) -> List[List[Dict[str, Union[str, datetime, bool]]]]:
+    """
+    Splits a list of file information dictionaries
+    into sublists based on their similarity.
+
+    This function processes a list of file information
+    dictionaries and groups them into
+    sublists where each sublist contains files
+    that are similar to each other based on
+    specific criteria.
+
+    Parameters:
+
+    - file_dicts (list of dict): List of file information dictionaries.
+
+    Returns:
+
+    - list of lists: List containing sublists of file dictionaries\
+    grouped by their similarity.
+
+    """
+    list_of_lists = []
+
+    temp_list = []
+    prev_elem = file_dicts[0]
+    for elem in file_dicts:
+        if _is_similar(elem, prev_elem):
+            temp_list.append(elem)
+        else:
+            list_of_lists.append(temp_list)
+            temp_list = [elem]
+        prev_elem = elem
+    list_of_lists.append(temp_list)
+    return list_of_lists
+
+
+def concatenate_files(
+    file_dicts: List[Dict[str, Union[str, datetime, bool]]]
+) -> ep.EchoData:
+    """
+    Reads the files referenced by a list of file information dictionaries
+    and combines them into a single EchoData object.
+    """
+    list_of_datasets = []
+    for file_info in file_dicts:
+        list_of_datasets.append(_read_file(file_info["file_path"]))
+    combined_dataset = ep.combine_echodata(list_of_datasets)
+    return combined_dataset
diff --git a/requirements.txt b/requirements.txt
index 4c35bdf..e157c6f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -41,3 +41,4 @@ sphinx_rtd_theme
 sphinxcontrib-mermaid
 twine
 wheel
+echopype
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..3fe3d4c
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,53 @@
+import os
+from ftplib import FTP
+
+import pytest
+
+
+current_directory = os.path.dirname(os.path.abspath(__file__))
+TEST_DATA_FOLDER = os.path.join(current_directory, "..", "test_data", "ek60")
+
+FTP_MAIN = "ftp.bas.ac.uk"
+FTP_PARTIAL_PATH = "rapidkrill/ek60/"
+
+
+def download_ftp_directory(ftp, remote_path, local_path):
+    try:
+        os.makedirs(local_path, exist_ok=True)
+        items = ftp.nlst(remote_path)
+
+        for item in items:
+            local_item_path = os.path.join(local_path, os.path.basename(item))
+            if is_directory(ftp, item):
+                download_ftp_directory(ftp, item, local_item_path)
+            else:
+                # Check if the file already exists locally
+                if not os.path.exists(local_item_path):
+                    with open(local_item_path, "wb") as local_file:
+                        ftp.retrbinary("RETR " + item, local_file.write)
+                else:
+                    # print(f"File {local_item_path} already exists. Skipping download.")
+                    continue
+
+    except Exception as e:
+        print(f"Error downloading {remote_path}. Error: {e}")
Error: {e}") + + +def is_directory(ftp, name): + try: + current = ftp.pwd() + ftp.cwd(name) + ftp.cwd(current) + return True + except: + return False + + +@pytest.fixture(scope="session") +def ftp_data(): + with FTP(FTP_MAIN) as ftp: + ftp.login() # Add credentials if needed: ftp.login(user="username", passwd="password") + download_ftp_directory(ftp, FTP_PARTIAL_PATH, TEST_DATA_FOLDER) + yield TEST_DATA_FOLDER + # Optional: Cleanup after tests are done + # shutil.rmtree(TEST_DATA_FOLDER) \ No newline at end of file diff --git a/tests/test_raw_reader.py b/tests/test_raw_reader.py new file mode 100644 index 0000000..eb416ee --- /dev/null +++ b/tests/test_raw_reader.py @@ -0,0 +1,175 @@ +import os +from ftplib import FTP + +import pytest + +from oceanstream.L0_unprocessed_data.raw_reader import ( + concatenate_files, + convert_raw_files, + file_finder, + file_integrity_checking, + read_processed_files, + read_raw_files, + split_files, +) + +current_directory = os.path.dirname(os.path.abspath(__file__)) +TEST_DATA_FOLDER = os.path.join(current_directory, "..", "test_data", "ek60") + + +def test_file_finder(ftp_data): + # Test with a valid directory path containing files + found_files = file_finder(ftp_data) + assert ( + len(found_files) > 0 + ) # Assuming there's at least one file in the FTP directory + assert all([os.path.isfile(f) for f in found_files]) + + # Test with a list of valid file paths + assert file_finder(found_files, "raw") == found_files + + # Test with a directory containing no files of the specified type + # Assuming there are no ".txt" files in the FTP directory + assert file_finder(ftp_data, "txt") == [] + + # Test with a list containing invalid file paths + invalid_path = os.path.join(ftp_data, "invalid_file.raw") + assert file_finder([found_files[0], invalid_path], "raw") == [found_files[0]] + + # Test with an invalid path (neither directory nor list of file paths) + with pytest.raises(ValueError): + file_finder(12345) + + +def test_file_integrity_checking(ftp_data): + found_files = file_finder(ftp_data) + # Test with a valid raw echo sounder file + result_files = file_integrity_checking(found_files[0]) + assert result_files["file_integrity"] == True + assert result_files["sonar_model"] in [ + "EK60", + "ES70", + "EK80", + "EA640", + "AZFP", + "AD2CP", + ] + + # Test with a valid netCDF file + valid_netcdf_file = convert_raw_files( + [result_files], save_path=TEST_DATA_FOLDER, save_file_type="nc" + )[0] + result = file_integrity_checking(valid_netcdf_file) + assert result["file_integrity"] == True + + # Test with a valid zarr file + valid_zarr_file = convert_raw_files( + [result_files], save_path=TEST_DATA_FOLDER, save_file_type="zarr" + )[0] + result = file_integrity_checking(valid_zarr_file) + assert result["file_integrity"] == True + + # Test with an unsupported file type + unsupported_file = file_finder(ftp_data, "png")[0] + with pytest.raises(Exception, match="File type not supported"): + file_integrity_checking(unsupported_file) + + +def test_read_raw_files(ftp_data): + # Test with a list of valid file dictionaries + found_files = file_finder(ftp_data, "raw") + file_dicts = [file_integrity_checking(f) for f in found_files] + + datasets = read_raw_files(file_dicts) + assert len(datasets) == 17 + # Additional assertions can be added based on expected dataset properties + + # Test with an empty list + datasets = read_raw_files([]) + assert len(datasets) == 0 + + +def test_read_processed_files(ftp_data): + # Test with a list of valid processed file paths + found_files = 
+    file_dicts = [file_integrity_checking(f) for f in found_files[:3]]
+    file_paths = convert_raw_files(
+        file_dicts, save_path=TEST_DATA_FOLDER, save_file_type="nc"
+    )
+
+    datasets = read_processed_files(file_paths)
+    assert len(datasets) == 3
+    # Additional assertions can be added based on expected dataset properties
+
+    # Test with an empty list
+    datasets = read_processed_files([])
+    assert len(datasets) == 0
+
+
+def test_convert_raw_files(ftp_data):
+    # Test conversion of raw files to netCDF
+    found_files = file_finder(ftp_data, "raw")
+    file_dicts = [file_integrity_checking(f) for f in found_files[:3]]
+    converted_files = convert_raw_files(
+        file_dicts, save_path=TEST_DATA_FOLDER, save_file_type="nc"
+    )
+    for file in converted_files:
+        assert os.path.exists(file)
+        assert file.endswith(".nc")
+
+    # Test conversion of raw files to zarr
+    converted_files = convert_raw_files(
+        file_dicts, save_path=TEST_DATA_FOLDER, save_file_type="zarr"
+    )
+    for file in converted_files:
+        assert os.path.exists(file)
+        assert file.endswith(".zarr")
+
+    # Test with an unsupported save file type
+    with pytest.raises(
+        Exception
+    ):  # Assuming the function raises an exception for unsupported file types
+        convert_raw_files(
+            file_dicts, save_path=TEST_DATA_FOLDER, save_file_type="unsupported"
+        )
+
+    # Test with an empty save path
+    converted_files = convert_raw_files(file_dicts, save_file_type="nc")
+    for file in converted_files:
+        assert os.path.exists(file)
+        assert file.endswith(".nc")
+
+
+def test_split_files(ftp_data):
+    # Test with a list of similar file dictionaries
+    found_files = file_finder(ftp_data, "raw")
+    file_dicts = [file_integrity_checking(f) for f in found_files[5:7]]
+
+    grouped_files = split_files(file_dicts)
+    assert len(grouped_files) == 1
+    assert len(grouped_files[0]) == 2
+
+    # Test with a list of dissimilar file dictionaries
+    found_files = file_finder(ftp_data, "raw")
+    file_dicts = [file_integrity_checking(f) for f in found_files[:3]]
+    grouped_files = split_files(file_dicts)
+    assert len(grouped_files) == 3
+    assert len(grouped_files[0]) == 1
+    assert len(grouped_files[1]) == 1
+
+    # Test with an empty list
+    with pytest.raises(Exception):
+        grouped_files = split_files([])
+
+
+def test_concatenate_files(ftp_data):
+    # Test with a list of valid file dictionaries
+    found_files = file_finder(ftp_data, "raw")
+    file_dicts = [file_integrity_checking(f) for f in found_files[5:7]]
+    converted_files = convert_raw_files(
+        file_dicts, save_path=TEST_DATA_FOLDER, save_file_type="nc"
+    )
+    file_dicts = [file_integrity_checking(f) for f in converted_files]
+    concatenated_dataset = concatenate_files(file_dicts)
+    # Here, you might want to add more assertions based on the expected properties of the concatenated dataset
+    assert concatenated_dataset is not None
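
A minimal usage sketch of how the new raw_reader helpers are intended to chain together (illustrative only, not part of the diff; the input and output directories are placeholder paths):

    from oceanstream.L0_unprocessed_data.raw_reader import (
        concatenate_files,
        convert_raw_files,
        file_finder,
        file_integrity_checking,
        split_files,
    )

    # Placeholder directories; point these at a real campaign folder.
    RAW_DIR = "/data/ek60"
    OUT_DIR = "/data/ek60/converted"

    raw_files = file_finder(RAW_DIR)  # sorted list of *.raw files
    file_dicts = [file_integrity_checking(f) for f in raw_files]

    # Group files from the same campaign/sonar recorded close together in time,
    # convert each group to netCDF, and combine it into a single EchoData object.
    for group in split_files(file_dicts):
        converted = convert_raw_files(group, save_path=OUT_DIR, save_file_type="nc")
        combined = concatenate_files([file_integrity_checking(p) for p in converted])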