Skip to content

Commit

Permalink
Merge pull request #15 from spirosmaggioros/main
Browse files Browse the repository at this point in the history
Fixed docstrings | Added workflow for unit testing automation | Added more test cases | replaced pytest with unittest | Fixed dependencies bugs |
  • Loading branch information
AlexanderGetka-cbica authored Jul 17, 2024
2 parents 250c16e + 6047310 commit 6a3f922
Show file tree
Hide file tree
Showing 13 changed files with 419 additions and 321 deletions.
33 changes: 33 additions & 0 deletions .github/workflows/macos_test_cases.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
name: spare_scores test cases on macos

# workflow_dispatch allows manual runs for testing purposes
on: [push, pull_request, workflow_dispatch]

jobs:
  build:
    runs-on: ["macos-latest"]

    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.8'
      - name: Set-up miniconda for macos and ubuntu
        uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true
          python-version: 3.8
          miniconda-version: "latest"
      - name: Create conda env
        run: conda create -n spare python=3.8
      - name: Install pip
        run: conda run -n spare conda install pip
      - name: Install dependencies and package from source
        # Install inside the conda env so the tests below exercise THIS checkout.
        # (Previously the workflow installed the PyPI release into the env but then
        # installed the source tree with the system pip, so the env under test and
        # the interpreter running the tests did not match.)
        run: conda run -n spare pip install setuptools && conda run -n spare pip install .
      - name: Run unit tests
        # Run the tests with the env's interpreter, not whatever python is on PATH.
        run: |
          cd tests/unit && conda run -n spare python -m unittest discover -s . -p "*.py"
32 changes: 32 additions & 0 deletions .github/workflows/ubuntu_test_cases.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
name: spare_scores test cases on ubuntu

# workflow_dispatch allows manual runs for testing purposes
on: [push, pull_request, workflow_dispatch]

jobs:
  build:
    runs-on: ["ubuntu-latest"]

    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.8'
      - name: Set-up miniconda for macos and ubuntu
        uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true
          python-version: 3.8
          miniconda-version: "latest"
      - name: Create conda env
        run: conda create -n spare python=3.8
      - name: Install pip
        run: conda run -n spare conda install pip
      - name: Install dependencies and package from source
        # Install inside the conda env so the tests below exercise THIS checkout.
        # (Previously the workflow installed the PyPI release into the env but then
        # installed the source tree with the system pip, so the env under test and
        # the interpreter running the tests did not match.)
        run: conda run -n spare pip install setuptools && conda run -n spare pip install .
      - name: Run unit tests
        # Run the tests with the env's interpreter, not whatever python is on PATH.
        run: |
          cd tests/unit && conda run -n spare python -m unittest discover -s . -p "*.py"
13 changes: 7 additions & 6 deletions dev-dependencies.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,29 +18,30 @@ jsonschema==4.17.3
kiwisolver==1.4.4
matplotlib==3.7.1
msgpack==1.0.5
numpy==1.24.4
numpy==1.23.5
packaging==23.1
pandas==2.0.3
Pillow==9.5.0
pkgutil_resolve_name==1.3.10
pluggy==1.2.0
pluggy==1.5.0
protobuf==4.23.3
pyparsing==3.1.0
pyrsistent==0.19.3
pytest==7.4.0
pytest==8.2.2
python-dateutil==2.8.2
pytz==2023.3
PyYAML==6.0
ray==2.5.1
requests==2.31.0
scikit-learn==1.2.2
scipy==1.10.1
scikit-learn==0.24.2
scipy==1.8.0
six==1.16.0
-e git+https://github.com/georgeaidinis/spare_score@3055a393e7aad704dd00dd378e45d695d99deebd#egg=spare_scores
threadpoolctl==3.1.0
tomli==2.0.1
torch==1.11.0
torch==2.3.1
typing_extensions==4.7.0
tzdata==2023.3
urllib3==2.0.3
zipp==3.15.0
setuptools==70.3.0
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@
include_package_data=True,
install_requires=['numpy',
'pandas',
'setuptools',
'scikit-learn',
'torch<2.1',
'torch<2.3.1',
'matplotlib',
'optuna'],
entry_points={
'console_scripts': ["spare_score = spare_scores.cli:main",
"spare_scores = spare_scores.cli:main",
"SPARE = spare_scores.cli:main"]
},
)
)
2 changes: 1 addition & 1 deletion spare_scores/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,4 +307,4 @@ def main():
arguments.logs)
return

return
return
75 changes: 39 additions & 36 deletions spare_scores/data_prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,21 @@


def check_train(df: pd.DataFrame,
predictors: list,
to_predict: str,
key_var: str,
pos_group: str = '',
verbose: int = 1) -> Tuple[pd.DataFrame, list, str]:
predictors: list,
to_predict: str,
verbose: int = 1, # TODO: remove (unused); removing it now breaks existing tests
# that call check_train() with a verbose argument
pos_group: str = '') -> Tuple[pd.DataFrame, list, str]:
"""Checks training dataframe for errors.
Args:
df: a pandas dataframe containing training data.
predictors: a list of predictors for SPARE model training.
to_predict: variable to predict.
pos_group: group to assign a positive SPARE score (only for classification).
df(pandas.DataFrame): a pandas dataframe containing training data.
predictors(list): a list of predictors for SPARE model training.
to_predict(str): variable to predict.
pos_group(str): group to assign a positive SPARE score (only for classification).
Returns:
a tuple containing 1) filtered dataframe, 2) filtered predictors, 3) SPARE model type.
Tuple[pandas.DataFrame, list, str]: a tuple containing 1)filtered dataframe, 2)filtered predictors, 3)SPARE model type.
"""
# GAI 26/04/2023: Removed check for existence of these columns
# if not {'ID','Age','Sex'}.issubset(set(df.columns)):
Expand Down Expand Up @@ -77,13 +77,12 @@ def check_train(df: pd.DataFrame,
return df, predictors, mdl_task

def check_test(df: pd.DataFrame,
meta_data: dict,
verbose: int = 1):
meta_data: dict):
"""Checks testing dataframe for errors.
Args:
df: a pandas dataframe containing testing data.
meta_data: a dictionary containing training information on its paired SPARE model.
df(pandas.DataFrame): a pandas dataframe containing testing data.
meta_data(dict): a dictionary containing training information on its paired SPARE model.
"""
############# Removing the hardcoded check for the below cols #############
# if not {'ID','Age','Sex'}.issubset(set(df.columns)):
Expand All @@ -106,31 +105,27 @@ def check_test(df: pd.DataFrame,
if np.sum(np.sum(pd.isna(df[meta_data['predictors']]))) > 0:
logging.warn('Some participants have invalid (missing or NaN values) predictor variables.')

############# Removing the hardcoded ID checks #############
if 'ID' not in df.columns:
# logging.info('"ID" column not found in the input dataframe. Treating all participants as independent from training.')
pass
else:
if 'ID' in df.columns:
if np.any(df['ID'].isin(meta_data['cv_results']['ID'])):
logging.info('Some participants seem to have been in the model training.')

return 'OK', None

def smart_unique(df1: pd.DataFrame,
df2: pd.DataFrame=None,
to_predict: str=None,
verbose: int=1) -> Union[pd.DataFrame, tuple]:
df2: pd.DataFrame=None,
to_predict: str=None) -> Union[pd.DataFrame, tuple]:
"""Select unique data points in a way that optimizes SPARE training.
For SPARE regression, preserve data points with extreme values.
For SPARE classification, preserve data points that help age match.
Args:
df1: a pandas dataframe.
df2: a pandas dataframe (optional) if df1 and df2 are two groups to classify.
to_predict: variable to predict. Binary for classification and continuous for regression.
df1(pandas.DataFrame)
df2(pandas.DataFrame): optional, if df1 and df2 are two groups to classify.
to_predict(str): variable to predict. Binary for classification and continuous for regression.
Must be one of the columns in df. Ignored if df2 is given.
Returns:
a trimmed pandas dataframe or a tuple of two dataframes with only one time point per ID.
pandas.DataFrame: a trimmed pandas dataframe or a tuple of two dataframes with only one time point per ID.
"""
assert (isinstance(df2, pd.DataFrame) or (df2 is None)), (
'Either provide a 2nd pandas dataframe for the 2nd argument or specify it with "to_predict"')
Expand Down Expand Up @@ -191,20 +186,20 @@ def age_sex_match(df1: pd.DataFrame,
"""Match two groups for age and sex.
Args:
df1: a pandas dataframe.
df2: a pandas dataframe (optional) if df1 and df2 are two groups to classify.
to_match: a binary variable of two groups. Must be one of the columns in df.
df1(pandas.DataFrame)
df2(pandas.DataFrame): optional, if df1 and df2 are two groups to classify.
to_match(str): a binary variable of two groups. Must be one of the columns in df.
Ignored if df2 is given.
If to_match is 'Sex', then only perform age matching.
p_threshold: minimum p-value for matching.
verbose: whether to output messages.
age_out_percentage: percentage of the larger group to randomly select a participant to
p_threshold(float): minimum p-value for matching. Default value = 0.15
verbose: whether to output messages (deprecated; will be removed in a future release)
age_out_percentage(float): percentage of the larger group to randomly select a participant to
take out from during the age matching. For example, if age_out_percentage = 20 and the
larger group is significantly older, then exclude one random participant from the fifth
quintile based on age.
quintile based on age. Default value = 20
Returns:
a trimmed pandas dataframe or a tuple of two dataframes with age/sex matched groups.
pandas.DataFrame: a trimmed pandas dataframe or a tuple of two dataframes with age/sex matched groups.
"""
assert (isinstance(df2, pd.DataFrame) or (df2 is None)), (
'Either provide a 2nd pandas dataframe for the 2nd argument or specify the two groups with "to_match"')
Expand Down Expand Up @@ -286,7 +281,15 @@ def age_sex_match(df1: pd.DataFrame,
else:
return (df1, df2)

def logging_basic_config(verbose=1, content_only=False, filename=''):
def logging_basic_config(verbose :int = 1, content_only = False, filename :str = ''):
"""
Basic logging configuration for error exceptions
Args:
verbose(int): input verbose. Default value = 1
content_only(bool): If set to True it will output only the needed content. Default value = False
filename(str): input filename. Default value = ''
"""
logging_level = {0:logging.WARNING, 1:logging.INFO, 2:logging.DEBUG, 3:logging.ERROR, 4:logging.CRITICAL}
fmt = ' %(message)s' if content_only else '%(levelname)s (%(funcName)s): %(message)s'
if filename != '' and filename is not None:
Expand All @@ -313,4 +316,4 @@ def convert_cat_variables(df, predictors, meta_data):
elif len(df[var].unique()) > 2:
raise ValueError('Categorical variables with more than 2 '
+ 'categories are currently not supported.')
return df, meta_data
return df, meta_data
29 changes: 17 additions & 12 deletions spare_scores/mlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,23 @@ class MLPModel:
arguments. These will be added as attributes to the class.
Methods:
train_model(df, **kwargs):
Trains the model using the provided dataframe.
fit(df, verbose):
Trains the model using the provided dataframe and default parameters.
Args:
df(pandas.DataFrame): the provided dataframe.
verbose(int)
Returns:
dict: A dictionary with the results from training.
apply_model(df):
Applies the trained model on the provided dataframe and returns
the predictions.
set_parameters(**parameters):
Updates the model's parameters with the provided values. This also
changes the model's attributes, while retaining the original ones.
predict(df):
Predicts the result of the provided dataframe using the trained model.
Args:
df(pandas.DataFrame): the provided dataframe.
Returns:
list: The predictions from the trained model regarding the provided dataframe.
"""
def __init__(self, predictors, to_predict, key_var, verbose=1,**kwargs):
def __init__(self, predictors, to_predict, key_var, verbose=1, **kwargs):
logger = logging_basic_config(verbose, content_only=True)

self.predictors = predictors
Expand Down Expand Up @@ -130,7 +135,7 @@ def _fit(self, df):
self.get_stats(y, self.y_hat)

@ignore_warnings(category= (ConvergenceWarning,UserWarning))
def fit(self, df, verbose=1, **kwargs):
def fit(self, df, verbose=1) -> dict:
logger = logging_basic_config(verbose, content_only=True)


Expand Down Expand Up @@ -168,7 +173,7 @@ def fit(self, df, verbose=1, **kwargs):

return result

def predict(self, df, verbose=1):
def predict(self, df):

X = df[self.predictors]
X_transformed = self.scaler.transform(X)
Expand Down
Loading

0 comments on commit 6a3f922

Please sign in to comment.