hicsail · cbolles · Jun 18, 2024 · Jun 17, 2024 · Jun 17, 2024 · Jun 18, 2024
diff --git a/packages/frp-cli/frp_cli/main.py b/packages/frp-cli/frp_cli/main.py
@@ -1,18 +1,26 @@
 from argparse import ArgumentParser, BooleanOptionalAction
 from pathlib import Path
 from frp import FRPScholarlyAnalysis, Matcher
+import toml
 
 
-def scholarly_analysis(input_csv: Path, frp_title: str, frp_year: int, save_results: bool, save_output: Path) -> None:
+def scholarly_analysis(input_csv: Path, config_path: Path, frp_title: str, frp_year: int, save_results: bool, save_output: Path) -> None:
     if not input_csv.exists():
         print('File {} does not exist'.format(input_csv))
         exit(1)
 
+    if not config_path.exists():
+        print('File {} does not exist'.format(config_path))
+        exit(1)
+
+    with open(config_path, 'r') as config_file:
+        config = toml.load(config_file)
+
     # Make the matcher
-    matcher = Matcher()
+    matcher = Matcher(config['scholarly']['matcher'])
 
     # Run the analysis
-    analyzer = FRPScholarlyAnalysis(matcher)
+    analyzer = FRPScholarlyAnalysis(matcher, config['scholarly'])
 
     # Collect the results
     results = analyzer.run_frp_analysis(input_csv, frp_title, frp_year)
@@ -35,6 +43,9 @@ def main():
     scholarly_parser.add_argument('--input',
                                   required=True,
                                   help='Input CSV')
+    scholarly_parser.add_argument('--config',
+                                  required=True,
+                                  help='Location of the FRP config file')
     scholarly_parser.add_argument('--frp-title',
                                   required=False,
                                   default='Leveraging AI to Examine Disparities and Bias in Health Care',
@@ -58,7 +69,7 @@ def main():
 
     # Determine the correct command to run
     if args.command == 'scholarly':
-        scholarly_analysis(Path(args.input), args.frp_title, args.frp_year, args.save_output, Path(args.output_csv))
+        scholarly_analysis(Path(args.input), Path(args.config), args.frp_title, args.frp_year, args.save_output, Path(args.output_csv))
         return
     else:
         print('Command {} not recognized'.format(args.command))

diff --git a/packages/frp-cli/poetry.lock b/packages/frp-cli/poetry.lock
diff --git a/packages/frp-cli/pyproject.toml b/packages/frp-cli/pyproject.toml
@@ -13,12 +13,14 @@ main = "frp_cli:main"
 
 [tool.poetry.dependencies]
 python = "^3.9"
+toml = "^0.10.2"
 
 
 [tool.poetry.group.dev.dependencies]
 flake8 = "^7.1.0"
 black = "^24.4.2"
 mypy = "^1.10.0"
+types-toml = "^0.10.8.20240310"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/packages/frp/frp/matcher.py b/packages/frp/frp/matcher.py
@@ -27,39 +27,28 @@ def _type(self) -> str:
 
 
 class Matcher:
-    def __init__(self):
+    def __init__(self, config: dict):
+        # Pull out the needed configs
+        system_prompt = config['system_prompt']
+        human_prompt = config['human_prompt']
+        model_name = config['model_name']
+        model_base_url = config['model_base_url']
+
         # Build up the LangChain chain for handling the matching
-        prompt_template = self._get_prompt_template()
-        model = self._get_model()
+        prompt_template = self._get_prompt_template(system_prompt, human_prompt)
+        model = self._get_model(model_base_url, model_name)
         output_parser = self._get_output_parser()
 
         self._chain = prompt_template | model | output_parser
 
-    def _get_prompt_template(self) -> runnables.Runnable:
+    def _get_prompt_template(self, system_prompt: str, human_prompt: str) -> runnables.Runnable:
         return ChatPromptTemplate.from_messages([
-            ('system',
-             '''
-                You are an assistant tasked with classifying whether the given publication title
-                is associated with the given research topic.
-
-                Specifically, the content should be marked as relevant if it involves:
-                    1. Publications which are likely to have been written based on the research topic as a prompt.
-                    2. If the publication title has overlap with the research topic.
-
-                Generate a short response indicating whether the content meets any of the above criteria. Respond
-                with "Yes" for relevance or "No" if the publication does not have high overlap.
-             '''),
-            ('human', '''
-             Assess the given headline and article body based on the specified criteria. Provide a concise response indicating relevance.
-
-Publication Title: {publication_title}
-
-Research Topic: {frp_title}
-             ''')
+            ('system', system_prompt),
+            ('human', human_prompt)
         ])
 
-    def _get_model(self) -> runnables.Runnable:
-        return Ollama(base_url='https://ollama-sail-24887a.apps.shift.nerc.mghpcc.org', model='llama2:13b')
+    def _get_model(self, base_url: str, model_name: str) -> runnables.Runnable:
+        return Ollama(base_url=base_url, model=model_name)
 
     def _get_output_parser(self) -> runnables.Runnable:
         return BooleanOutputParser()

diff --git a/packages/frp/frp/scholarly.py b/packages/frp/frp/scholarly.py
@@ -7,9 +7,16 @@ class FRPScholarlyAnalysis:
     """
     Handles the cleaning and matching of MyCV CSVs to FRPs
     """
-    def __init__(self, matcher: Matcher):
+    def __init__(self, matcher: Matcher, config: dict):
         self._matcher = matcher
 
+        # Mapping from matcher input parameters to the DataFrame
+        # columns that supply their values.
+        # Each key is a parameter name passed to the matcher;
+        # each value is the name of the column to read it from.
+        # e.g. publication_title: 'Title OR Chapter title'
+        self._mappings = config['matcher']['mappings']
+
     def _load(self, csv_location: Path) -> pd.DataFrame:
         """
         Read in the dataframe from the CSV. Does not additional
@@ -77,10 +84,16 @@ def _match(self, df: pd.DataFrame, frp_title: str) -> pd.DataFrame:
         """
         # Function which is applied to every row in the dataframe
         def apply_matcher(row: pd.Series) -> pd.Series:
+            # First get all shared mappings
             mapping = {
-                'publication_title': row['Title OR Chapter title'],
                 'frp_title': frp_title
             }
+
+            # Then add the per-row values, reading each one from the
+            # column named in the config
+            for key, value in self._mappings.items():
+                mapping[key] = row[value]
+
             return pd.Series(self._matcher.match(mapping))
 
         # Make a copy of the data any apply the matching row-by-row