Skip to content

Commit

Permalink
processing refacto
Browse files Browse the repository at this point in the history
  • Loading branch information
svittoz committed Jan 10, 2024
1 parent ed2acb0 commit 38dc899
Show file tree
Hide file tree
Showing 15 changed files with 1,867 additions and 140 deletions.
65 changes: 7 additions & 58 deletions eds_scikit/biology/cleaning/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,11 @@

from eds_scikit.biology.cleaning.cohort import select_cohort
from eds_scikit.biology.cleaning.transform import transform_measurement
from eds_scikit.biology.cleaning.utils import check_data_and_select_columns
from eds_scikit.biology.utils.process_concepts import (
ConceptsSet,
fetch_all_concepts_set,
get_concept_src_to_std,
)
from eds_scikit.biology.utils.process_measurement import (
filter_measurement_by_date,
get_measurement_std,
get_valid_measurement,
)
from eds_scikit.biology.utils.check_data import check_data_and_select_columns
from eds_scikit.biology.utils.process_concepts import ConceptsSet
from eds_scikit.biology.utils.prepare_measurement import prepare_measurement_table
from eds_scikit.io import settings
from eds_scikit.utils.typing import Data, DataFrame
from eds_scikit.biology.utils.prepare_df import prepare_biology_relationship

default_standard_terminologies = settings.standard_terminologies
default_standard_concept_regex = settings.standard_concept_regex
Expand All @@ -25,7 +16,7 @@
def bioclean(
data: Data,
concepts_sets: List[ConceptsSet] = None,
config_name: str = None,
config_name: str = None, #config_name
start_date: datetime = None,
end_date: datetime = None,
studied_cohort: Union[DataFrame, List[int]] = None,
Expand Down Expand Up @@ -61,56 +52,14 @@ def bioclean(
Data
Same as the input with the transformed `bioclean` table
"""
# Check the data and extract them
measurement, concept, concept_relationship = check_data_and_select_columns(data)

# Filter valid measurement
measurement_valid = get_valid_measurement(measurement)

# Filter measurement by date
measurement_timed = filter_measurement_by_date(
measurement_valid, start_date, end_date
)

# Query concepts-set information
#if concepts_sets is None:
# concepts_sets = fetch_all_concepts_set()

# Map biology concept
source_terminologies = {
"ANALYSES_LABORATOIRE": r"Analyses Laboratoire",
"GLIMS_ANABIO": r"GLIMS.{0,20}Anabio",
"GLIMS_LOINC": r"GLIMS.{0,20}LOINC",
"ANABIO_ITM": r"ITM - ANABIO",
"LOINC_ITM": r"ITM - LOINC",
}

mapping = [
("ANALYSES_LABORATOIRE", "GLIMS_ANABIO", "Maps to"),
("ANALYSES_LABORATOIRE", "GLIMS_LOINC", "Maps to"),
("GLIMS_ANABIO", "ANABIO_ITM", "Mapped from"),
("ANABIO_ITM", "LOINC_ITM", "Maps to"),
]

biology_relationship_table = prepare_biology_relationship(data,
source_terminologies,
mapping,
concepts_sets=concepts_sets)
measurements = prepare_measurement_table(data, start_date, end_date, concept_sets, cohort=None, convert_units=False, outliers_detection=None)

measurement_std_filtered = measurement_timed.merge(biology_relationship_table,
left_on="measurement_source_concept_id",
right_on=f"{mapping[0][0]}_concept_id")

# Extract concept-set
measurement_std_filtered = measurement_std_filtered.drop(columns=["measurement_source_concept_id"])
measurement_std_filtered = measurement_std_filtered.rename(columns={"GLIMS_ANABIO_concept_code" : "AnaBio_concept_code",
"GLIMS_LOINC_concept_code" : "LOINC_concept_code"})
measurement_std_filtered = measurement_std_filtered.drop(columns=["ANALYSES_LABORATOIRE_concept_code", "ANABIO_ITM_concept_code", "LOINC_ITM_concept_code"])
# Filter Measurement
if studied_cohort:
measurement_std_filtered = select_cohort(
measurement_std_filtered, studied_cohort
measurements, studied_cohort
)

# Transform values
data.bioclean = transform_measurement(measurement_std_filtered, clip, config_name)
data.bioclean = transform_measurement(measurements, clip, config_name)
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
from eds_scikit.utils.checks import check_columns, check_tables
from eds_scikit.utils.typing import Data


def check_data_and_select_columns(data: Data):
"""Check the required tables and columns in the Data and extract them
def check_data_and_select_columns_measurement(data: Data):
"""Check the required tables and columns in the Data and extract them.
Parameters
----------
data : Data
Expand Down Expand Up @@ -56,3 +55,44 @@ def check_data_and_select_columns(data: Data):
concept_relationship = data.concept_relationship[_relationship_required_columns]

return measurement, concept, concept_relationship


def check_data_and_select_columns_relationship(data: Data):
    """Check the concept tables and columns in the Data and extract them.

    Only the ``concept`` and ``concept_relationship`` tables are needed here;
    the validation itself is delegated to ``check_tables`` and
    ``check_columns``.

    Parameters
    ----------
    data : Data
        Instantiated [``HiveData``][eds_scikit.io.hive.HiveData], [``PostgresData``][eds_scikit.io.postgres.PostgresData] or [``PandasData``][eds_scikit.io.files.PandasData]

    Returns
    -------
    concept
        ``data.concept`` restricted to ``concept_id``, ``concept_name``,
        ``concept_code`` and ``vocabulary_id``.
    concept_relationship
        ``data.concept_relationship`` restricted to ``concept_id_1``,
        ``concept_id_2`` and ``relationship_id``.
    """
    check_tables(
        data,
        required_tables=[
            "concept",
            "concept_relationship",
        ],
    )

    _concept_required_columns = [
        "concept_id",
        "concept_name",
        "concept_code",
        "vocabulary_id",
    ]

    _concept_relationship_required_columns = [
        "concept_id_1",
        "concept_id_2",
        "relationship_id",
    ]

    check_columns(data.concept, required_columns=_concept_required_columns)
    check_columns(
        data.concept_relationship,
        required_columns=_concept_relationship_required_columns,
    )

    concept = data.concept[_concept_required_columns]
    # Select with the list validated just above; the previous name
    # (`_relationship_required_columns`) is not defined in this function
    # and raised NameError.
    concept_relationship = data.concept_relationship[
        _concept_relationship_required_columns
    ]

    return concept, concept_relationship
Loading

0 comments on commit 38dc899

Please sign in to comment.