PRC backend debugging PR #1

Open
wants to merge 6 commits into base: docker
Binary file added .DS_Store
Binary file not shown.
6 changes: 6 additions & 0 deletions .gitignore
@@ -11,6 +11,9 @@ __pycache__/
build/
develop-eggs/
dist/
lib/
bin/
pyvenv.cfg
downloads/
eggs/
.eggs/
@@ -159,3 +162,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

joaopn/*

69 changes: 58 additions & 11 deletions Dockerfile
@@ -1,34 +1,81 @@
# Use an official NVIDIA CUDA base image with Ubuntu
FROM nvidia/cuda:12.5.0-base-ubuntu22.04
# Use an official Python 3.11 base image for AMD64
FROM python:3.11-slim-bullseye

# Set the working directory
WORKDIR /app

# Install wget and bzip2, necessary for Miniconda installation
RUN apt-get update && apt-get install -y wget bzip2
# Install system dependencies and build tools
RUN apt-get update && apt-get install -y \
wget \
bzip2 \
gcc \
g++ \
&& rm -rf /var/lib/apt/lists/*

# Install Miniconda
# Install Miniconda for AMD64
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
bash miniconda.sh -b -p /miniconda && \
chmod +x miniconda.sh && \
./miniconda.sh -b -p /miniconda && \
rm miniconda.sh

# Install Miniconda for aarch64 (for container debugging on M1 Mac)
# RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-aarch64.sh -O miniconda.sh && \
# chmod +x miniconda.sh && \
# ./miniconda.sh -b -p /miniconda && \
# rm miniconda.sh


# Add Miniconda to PATH
ENV PATH="/miniconda/bin:${PATH}"

# install mamba since conda has trouble properly resolving dependencies
RUN conda install -c conda-forge -y mamba

# Copy the environment.yml file
COPY environment.yml /app/environment.yml

# Create the Conda environment
RUN conda env create -f environment.yml
RUN mamba env create --debug -f environment.yml
RUN mamba clean -afy

# Make RUN commands use the new environment:
# Activate the conda environment
SHELL ["conda", "run", "-n", "ranker", "/bin/bash", "-c"]

# Ensure the Python executable used is from the conda environment
ENV PATH /miniconda/envs/ranker/bin:$PATH
COPY model_download.py /app/model_download.py

RUN conda run -n ranker python model_download.py

# Anaconda should install all of these packages for us.
#
# # Install PyTorch for CPU (since we're not using CUDA in this setup)
# RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

# # Install additional packages
# RUN pip install --no-cache-dir \
# --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ \
# optimum[onnxruntime-gpu] \
# # optimum[onnxruntime] \
# fastapi==0.111.0 \
# fasttext_wheel==0.9.2 \
# huggingface_hub==0.23.3 \
# lexicalrichness==0.5.1 \
# numpy==1.26.4 \
# pandas==2.2.2 \
# pydantic==2.7.4 \
# ranking_challenge==2.0.0 \
# sentence_transformers==3.0.1 \
# transformers==4.41.2 \
# uvicorn==0.30.1 \
# simplejson \
# numexpr \
# bottleneck

# Copy the rest of your application's code
COPY . /app

# Expose the port the app runs on
EXPOSE 8000

# Set the default command to run your FastAPI application
# CMD ["conda", "run", "--no-capture-output", "-n", "ranker", "python", "main.py", "--port", "8000", "--scroll_warning_limit", "-0.1", "--batch_size", "8"]
CMD ["conda", "run", "--no-capture-output", "-n", "ranker", "python", "start_server.py", "--port", "8000", "--batch_size", "8", "--scroll_warning_limit", "-0.1"]
14 changes: 10 additions & 4 deletions README.md
@@ -1,6 +1,12 @@
# Civirank
Repo for the Civirank ranker.

## Building with Docker

1. Download `model.onnx` from https://huggingface.co/joaopn/unbiased-toxic-roberta-onnx-fp16/tree/main
2. Put it in the project root (steps 1 and 2 are sketched below)
3. Build the Docker container
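
A minimal sketch of steps 1 and 2 using `huggingface_hub` (already pinned in environment.yml); the repository id comes from the link above, and the assumption is that the file in that repository is named `model.onnx`:

```python
# Sketch: fetch model.onnx into the project root before building the image.
# Assumes the file in joaopn/unbiased-toxic-roberta-onnx-fp16 is named "model.onnx".
from huggingface_hub import hf_hub_download

hf_hub_download(
    repo_id="joaopn/unbiased-toxic-roberta-onnx-fp16",
    filename="model.onnx",
    local_dir=".",  # project root
)
```

With the file in place, `docker compose up --build` should build and start the container via the provided docker-compose.yml.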

# Submission form

* **Describe your algorithm. How will you reorder, add, and/or remove content?** (Up to 250 words.)
@@ -73,12 +79,12 @@ I would like to add back two items from the addiction scale, replacing "phone" w
3. *A few posts ago we showed you a warning, encouraging you to stop scrolling in your feed. Why did you keep scrolling?* [I didn't notice the warning, I didn't want to stop, I was curious about the posts after the warning, None of the above, I don't know]

* **What is new about your approach? Why do you believe that it will work, and what evidence do you have? Please discuss your theory of change and include citations to previous work.** (Up to 250 words.)
I believe my ranker has two new components (a simplified sketch of the scoring behind (1) follows below):
(1) A balanced ranking that considers various quality dimensions with the overall aim of improving civic discourse.
(2) "Scroll warnings" reducing the social media usage if it would mainly cause users to see low-quality content.
(2) "Scroll warnings" reducing the social media usage if it would mainly cause users to see low-quality content.

Below I discuss why I believe it will work:

- Reducing toxic content might prevent members of minority groups from leaving or self-censoring (hateful content directed at minorities has these effects https://doi.org/10.1177/1043986213507403).
- Reducing content high in affective polarization might directly reduce affective polarization. The Facebook & Instagram election study (https://doi.org/10.1126/science.abp9364) didn't find a change in affective polarization, but it only tested a curated vs. a chronological feed, not re-ranking based on affective polarization. I am interested to see if this makes a difference.
- Prosocial features in language predict a prosocial trajectory of conversations (see https://doi.org/10.1145/3442381.3450122). I expect people to have an overall better time using the platform if they see more prosocial content, keeping up retention. I also expect them to see less content "bad for the world" that is not caught by the toxicity ranking.
@@ -92,7 +98,7 @@ Below I discuss why I believe it will work:
The prototype was built using Python. Dependencies are the libraries numpy, pandas, sentence_transformers, PyTorch, googleapiclient, lexicalrichness and langdetect (I can provide a requirements.txt file). In addition, the prototype depends on pre-calculated embeddings of the polarization and prosociality dictionaries, and on the NewsGuard trustworthiness scores, provided as a .csv file. The prototype also currently uses the Google Perspective API to calculate toxicity scores.
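
As a rough illustration of the NewsGuard component (trustworthiness is assessed at the domain level, see the PNAS Nexus paper cited below), the following sketch assumes a hypothetical `newsguard_scores.csv` with `domain` and `score` columns; the actual file layout and column names may differ:

```python
from urllib.parse import urlparse

import pandas as pd

# Assumed layout: one row per rated domain with a 0-100 NewsGuard score.
newsguard = pd.read_csv("newsguard_scores.csv")  # columns: domain, score (assumed)
domain_scores = dict(zip(newsguard["domain"], newsguard["score"]))

def trustworthiness(url: str) -> float | None:
    """Map a shared URL to the NewsGuard score of its domain, if rated."""
    domain = urlparse(url).netloc.removeprefix("www.")
    return domain_scores.get(domain)
```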

* **Where can we find your submission? This needs to be a URL to a live HTTP endpoint implementing our first-round specifications.**
Ranker can be found at http://5.75.245.130:5001/rank. Note that this is currently quite slow (it takes about 1 min to rank ~600 posts). The main bottleneck is the sequential toxicity inference with the Perspective API, which will be switched to a locally hosted alternative.
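
A hedged example of calling that endpoint; the payload below only contains the fields the ranker is seen reading in this diff (`session.platform`, plus `id` and `text` per item), so it is illustrative only and the real `RankingRequest` schema from the `ranking_challenge` package will require additional fields:

```python
import requests

# Illustrative payload only; the full RankingRequest schema has more required fields.
payload = {
    "session": {"platform": "reddit"},  # twitter, reddit, or facebook
    "items": [
        {"id": "post-1", "text": "First example post."},
        {"id": "post-2", "text": "Second example post."},
    ],
}

response = requests.post("http://5.75.245.130:5001/rank", json=payload, timeout=120)
response.raise_for_status()
print(response.json())  # reordered item ids, plus any inserted scroll-warning item
```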

* **Have any members of your team previously published related work? Please provide up to five links to previous projects, code, papers, etc.**
- Social media sharing of low quality news sources by political elites https://doi.org/10.1093/pnasnexus/pgac186. In this paper we use the same approach of assessing trustworthiness of news pieces on the domain level with NewsGuard scores.
194 changes: 109 additions & 85 deletions civirank/rankers.py
@@ -29,11 +29,32 @@ def __init__(self, weights=None, lim=False, min_scores=0, debug=False, warning_u
self.warning_urls = warning_urls

# Initialize analyzers
self.TrustworthinessAnalyzer = analyzers.TrustworthinessAnalyzer()
self.ToxicityAnalyzer = analyzers.ToxicityAnalyzer()
self.ProsocialityPolarizationAnalyzer = analyzers.ProsocialityPolarizationAnalyzer()
self.LexicalDensityAnalyzer = analyzers.LexicalDensityAnalyzer()
self.LanguageAnalyzer = analyzers.LanguageAnalyzer()
try:
print("Initializing TrustworthinessAnalyzer...")
self.TrustworthinessAnalyzer = analyzers.TrustworthinessAnalyzer()
print("TrustworthinessAnalyzer initialized.")

print("Initializing ToxicityAnalyzer...")
self.ToxicityAnalyzer = analyzers.ToxicityAnalyzer()
print("ToxicityAnalyzer initialized.")

print("Initializing ProsocialityPolarizationAnalyzer...")
self.ProsocialityPolarizationAnalyzer = analyzers.ProsocialityPolarizationAnalyzer()
print("ProsocialityPolarizationAnalyzer initialized.")

print("Initializing LexicalDensityAnalyzer...")
self.LexicalDensityAnalyzer = analyzers.LexicalDensityAnalyzer()
print("LexicalDensityAnalyzer initialized.")

print("Initializing LanguageAnalyzer...")
self.LanguageAnalyzer = analyzers.LanguageAnalyzer()
print("LanguageAnalyzer initialized.")

except Exception as e:
print(f"Error during analyzer initialization: {str(e)}")
raise

print("LocalRankerGradual initialization completed.")

# Scores that are considered in the compound score
self.scores = ['no_toxicity', 'no_polarization', 'mtld', 'trustworthiness', 'prosociality']
@@ -48,85 +69,88 @@ def __init__(self, weights=None, lim=False, min_scores=0, debug=False, warning_u
self.debug = debug

def rank(self, ranking_request, batch_size=16, scroll_warning_limit=-0.1):

# Check if ranking_request is a RankingRequest object or a dictionary
if isinstance(ranking_request, RankingRequest):
dataset = ranking_request.dict()
else:
dataset = ranking_request

platform = dataset["session"]["platform"]

# Detect language of each post
for i in range(len(dataset["items"])):
dataset['items'][i]['lang'] = self.LanguageAnalyzer.detect_language(dataset['items'][i]['text'].replace('\n', ' '))
if self.debug:
dataset['items'][i]['original_rank'] = i

if self.debug:
print("{:d} posts not in English.".format(len(dataset["items"]) - len([item for item in dataset["items"] if item["lang"] == "en"])))

#prints value_counts of languages
print(pd.DataFrame([item["lang"] for item in dataset["items"]], columns=["lang"])["lang"].value_counts())

# Parse posts
if platform == "twitter":
posts = parsers.parse_twitter_posts(dataset["items"], lim=self.lim, debug=self.debug)
elif platform == "reddit":
posts = parsers.parse_reddit_posts(dataset["items"], lim=self.lim, debug=self.debug)
elif platform == "facebook":
posts = parsers.parse_facebook_posts(dataset["items"], lim=self.lim, debug=self.debug)

# Splits the posts into ones that get reranked and ones that don't
parse_posts = posts[(posts["lang"] == "en") & (posts["text"].str.len() > 0)].copy()
non_parse_posts = posts[(posts["lang"] != "en") | (posts["text"].str.len() == 0)].copy()

# Process posts
parse_posts.loc[:, "trustworthiness"] = self.TrustworthinessAnalyzer.get_trustworthiness_scores(parse_posts)
parse_posts.loc[:, "toxicity"] = self.ToxicityAnalyzer.get_toxicity_scores(parse_posts, batch_size=batch_size)
parse_posts.loc[:, "polarization"] = self.ProsocialityPolarizationAnalyzer.get_similarity_polarization(parse_posts)
parse_posts.loc[:, "prosociality"] = self.ProsocialityPolarizationAnalyzer.get_similarity_prosocial(parse_posts)
parse_posts.loc[:, "mtld"] = self.LexicalDensityAnalyzer.get_mtld(parse_posts)

parse_posts = analyzers.normalize(parse_posts)

# Calculate the compound score
parse_posts["compound_score"] = parse_posts[self.scores].apply(analyzers.calculate_compound_score, args=(self.weights, self.min_scores), axis=1)

# Sort posts in descending order based on compound score
parse_posts = parse_posts.sort_values(by="compound_score", ascending=False)

# Create a list to store final posts in the correct order
final_posts = []
en_index = 0
non_en_index = 0

# Reinsert posts to their original positions
for idx in range(len(posts)):
if posts.iloc[idx]["lang"] == "en" and posts.iloc[idx]["text"].strip() != "":
final_posts.append(parse_posts.iloc[en_index])
en_index += 1
try:
# Check if ranking_request is a RankingRequest object or a dictionary
if isinstance(ranking_request, RankingRequest):
dataset = ranking_request.dict()
else:
final_posts.append(non_parse_posts.iloc[non_en_index])
non_en_index += 1

# Reset index for the final_posts list
final_posts_df = pd.DataFrame(final_posts).reset_index(drop=True)

# Inserts a warning message for the scroll component
insert_index = final_posts_df[final_posts_df['compound_score'] < scroll_warning_limit].first_valid_index()
if insert_index is not None:
id_platform = self.warning_urls[platform]['id']
new_row = pd.DataFrame({'id': id_platform, 'compound_score': scroll_warning_limit}, index=[insert_index - 0.5])
final_posts_df = pd.concat([final_posts_df.iloc[:insert_index], new_row, final_posts_df.iloc[insert_index:]]).reset_index(drop=True)


# Return full dataframe with original dataset and scores if DEBUG is True
if self.debug:
return final_posts_df
dataset = ranking_request

platform = dataset["session"]["platform"]

# Detect language of each post
for i in range(len(dataset["items"])):
try:
dataset['items'][i]['lang'] = self.LanguageAnalyzer.detect_language(dataset['items'][i]['text'].replace('\n', ' '))
except Exception as e:
print(f"Error detecting language for item {i}: {str(e)}")
dataset['items'][i]['lang'] = 'unknown'
if self.debug:
dataset['items'][i]['original_rank'] = i

# Parse posts
if platform == "twitter":
posts = parsers.parse_twitter_posts(dataset["items"], lim=self.lim, debug=self.debug)
elif platform == "reddit":
posts = parsers.parse_reddit_posts(dataset["items"], lim=self.lim, debug=self.debug)
elif platform == "facebook":
posts = parsers.parse_facebook_posts(dataset["items"], lim=self.lim, debug=self.debug)
else:
raise ValueError(f"Unsupported platform: {platform}")

# Splits the posts into ones that get reranked and ones that don't
parse_posts = posts[(posts["lang"] == "en") & (posts["text"].str.len() > 0)].copy()
non_parse_posts = posts[(posts["lang"] != "en") | (posts["text"].str.len() == 0)].copy()

# Process posts
parse_posts.loc[:, "trustworthiness"] = self.TrustworthinessAnalyzer.get_trustworthiness_scores(parse_posts)
parse_posts.loc[:, "toxicity"] = self.ToxicityAnalyzer.get_toxicity_scores(parse_posts, batch_size=batch_size)
parse_posts.loc[:, "polarization"] = self.ProsocialityPolarizationAnalyzer.get_similarity_polarization(parse_posts)
parse_posts.loc[:, "prosociality"] = self.ProsocialityPolarizationAnalyzer.get_similarity_prosocial(parse_posts)
parse_posts.loc[:, "mtld"] = self.LexicalDensityAnalyzer.get_mtld(parse_posts)

parse_posts = analyzers.normalize(parse_posts)

# Calculate the compound score
parse_posts["compound_score"] = parse_posts[self.scores].apply(analyzers.calculate_compound_score, args=(self.weights, self.min_scores), axis=1)

# Sort posts in descending order based on compound score
parse_posts = parse_posts.sort_values(by="compound_score", ascending=False)

# Create a list to store final posts in the correct order
final_posts = []
en_index = 0
non_en_index = 0

# Reinsert posts to their original positions
for idx in range(len(posts)):
if posts.iloc[idx]["lang"] == "en" and posts.iloc[idx]["text"].strip() != "":
final_posts.append(parse_posts.iloc[en_index])
en_index += 1
else:
final_posts.append(non_parse_posts.iloc[non_en_index])
non_en_index += 1

# Reset index for the final_posts list
final_posts_df = pd.DataFrame(final_posts).reset_index(drop=True)

# Inserts a warning message for the scroll component
insert_index = final_posts_df[final_posts_df['compound_score'] < scroll_warning_limit].first_valid_index()
if insert_index is not None:
id_platform = self.warning_urls[platform]['id']
new_row = pd.DataFrame({'id': id_platform, 'compound_score': scroll_warning_limit}, index=[insert_index - 0.5])
final_posts_df = pd.concat([final_posts_df.iloc[:insert_index], new_row, final_posts_df.iloc[insert_index:]]).reset_index(drop=True)

# Return full dataframe with original dataset and scores if DEBUG is True
if self.debug:
return final_posts_df

# Otherwise, return list of ids
if insert_index is not None:
return list(final_posts_df["id"]), [self.warning_urls[platform]]
else:
return list(final_posts_df["id"]), []
# Otherwise, return list of ids
if insert_index is not None:
return list(final_posts_df["id"]), [self.warning_urls[platform]]
else:
return list(final_posts_df["id"]), []

except Exception as e:
print(f"Error in rank method: {str(e)}")
raise
5 changes: 3 additions & 2 deletions docker-compose.yml
@@ -1,5 +1,6 @@
services:
civirank:
image: civirank-civirank
build:
context: .
dockerfile: Dockerfile
@@ -8,7 +9,7 @@ services:
environment:
BATCH_SIZE: "${BATCH_SIZE}"
SCROLL_WARNING_LIMIT: "${SCROLL_WARNING_LIMIT}"
command: ["python", "start_server.py", "--port", "${PORT}", "--batch_size", "${BATCH_SIZE}", "--scroll_warning_limit", "${SCROLL_WARNING_LIMIT}"]
command: ["conda", "run", "--no-capture-output", "-n", "ranker", "python", "start_server.py", "--port", "${PORT}", "--batch_size", "${BATCH_SIZE}", "--scroll_warning_limit", "${SCROLL_WARNING_LIMIT}"]
volumes:
- .:/app
deploy:
@@ -17,4 +18,4 @@ services:
devices:
- driver: nvidia
capabilities: [gpu]
count: 1
4 changes: 2 additions & 2 deletions environment.yml
@@ -7,7 +7,7 @@ channels:
dependencies:
- python=3.11
- pytorch
- pytorch-cuda=12.1
# - pytorch-cuda=12.1
- pandas
- pip
- pip:
@@ -26,4 +26,4 @@ dependencies:
- uvicorn==0.30.1
- simplejson
- numexpr
- bottleneck