Commit
Michael Malyuk committed Apr 10, 2024
2 parents a78f6ab + 6234415 commit e02bf46
Showing 93 changed files with 1,330 additions and 1,225 deletions.
42 changes: 42 additions & 0 deletions .github/scripts/validate_dockerignore.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash

DUMMY_DOCKERFILE_CONTENT="FROM alpine\nCOPY . /app"
TEMP_IMAGE_BASE="temp_context_image"

# Loop through all directories containing a .dockerignore file
# Loop through all example directories and check each for a .dockerignore file
for dir in ../../label_studio_ml/examples/*/; do
if [ -f "$dir/.dockerignore" ]; then
echo "Checking directory: $dir"

# Navigate into the directory
pushd "$dir" >/dev/null

# Create a temporary dummy Dockerfile
echo -e "$DUMMY_DOCKERFILE_CONTENT" >Dockerfile.tmp

# Define a unique temporary image name using directory name to avoid conflicts
TEMP_IMAGE="${TEMP_IMAGE_BASE}_$(basename "$dir")"

# Build the temporary image; its build context reflects the .dockerignore rules
docker build -q -f Dockerfile.tmp -t "$TEMP_IMAGE" . >/dev/null

# Remove the temporary Dockerfile
rm -f Dockerfile.tmp

# List all files excluding the temporary Dockerfile, sorted for comparison
LOCAL_FILES=$(find . -mindepth 1 -type f -not -name "Dockerfile.tmp" | grep -v '.dockerignore\|README.md' | sort)

# Use a Docker container to list all files included in the build context, simulating .dockerignore application
INCLUDED_FILES=$(docker run --rm -w /app "${TEMP_IMAGE}" find . -mindepth 1 -type f | grep -v '.dockerignore\|README.md' | sort)

# Pop back to the parent directory
popd >/dev/null

# Compare the lists: local files missing from the build context were excluded by .dockerignore
echo "Files ignored by .dockerignore:"
comm -23 <(echo "$LOCAL_FILES") <(echo "$INCLUDED_FILES")
echo "---------------------------------------------------------------------------------------------------------------------------"
else
echo "No .dockerignore found in $dir"
fi
done
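The comparison logic of the script above can be sketched in Python (a hypothetical helper, not part of the commit): files present on disk but absent from the image built from the same context are exactly the ones `.dockerignore` excluded.

```python
def files_ignored_by_context(local_files, context_files):
    """Return files present locally but absent from the Docker build
    context, i.e. files excluded by .dockerignore (mirrors `comm -23`
    on the two sorted lists)."""
    return sorted(set(local_files) - set(context_files))

# Example: model.py made it into the context, the test data did not.
local = ["./model.py", "./tests/data.csv", "./requirements.txt"]
context = ["./model.py", "./requirements.txt"]
print(files_ignored_by_context(local, context))  # ['./tests/data.csv']
```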
2 changes: 2 additions & 0 deletions .github/workflows/docs.yml
@@ -48,6 +48,8 @@ jobs:
token: "${{ secrets.GIT_PAT }}"
repository: "${{ github.repository_owner }}/${{ env.DOCS_REPO_NAME }}"
path: "${{ env.DOCS_REPO_PATH }}"
ref: docs-ml-backend-preview
# TODO: Remove "ref: docs-ml-backend-preview" for prod deployment

- name: Generate Docs
env:
92 changes: 62 additions & 30 deletions README.md

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions label_studio_ml/default_configs/.dockerignore
@@ -0,0 +1,18 @@
# Exclude everything
**

# Include Dockerfile and docker-compose for reference (optional, decide based on your use case)
!Dockerfile
!docker-compose.yml

# Include Python application files
!*.py

# Include requirements files
!requirements*.txt

# Include script
!*.sh

# Exclude specific requirements if necessary
# requirements-test.txt (Uncomment if you decide to exclude this)
8 changes: 8 additions & 0 deletions label_studio_ml/default_configs/docker-compose.yml
@@ -21,6 +21,14 @@ services:
- THREADS=8
# specify the model directory (likely you don't need to change this)
- MODEL_DIR=/data/models

# Specify the Label Studio URL and API key so the backend can access
# uploaded files as well as local and cloud storage files.
# Do not use 'localhost': inside a Docker container it resolves to the container itself.
# Always prefix the URL with 'http://' or 'https://'.
# Find the host's actual IP with 'ifconfig' (Linux/Mac) or 'ipconfig' (Windows).
- LABEL_STUDIO_URL=
- LABEL_STUDIO_API_KEY=
ports:
- "9090:9090"
volumes:
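The connection settings introduced above can be validated at startup. A minimal sketch (the helper name is hypothetical) that catches the common "localhost inside Docker" mistake described in the comments:

```python
import os

def label_studio_connection():
    """Read the connection env vars from docker-compose.yml and reject
    the values the comments warn about."""
    url = os.environ.get("LABEL_STUDIO_URL", "")
    key = os.environ.get("LABEL_STUDIO_API_KEY", "")
    if not url.startswith(("http://", "https://")):
        raise ValueError("LABEL_STUDIO_URL must start with http:// or https://")
    if "localhost" in url or "127.0.0.1" in url:
        # inside a container, localhost is the container itself, not the host
        raise ValueError("Use the host's real IP instead of 'localhost'")
    return url, key
```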
8 changes: 5 additions & 3 deletions label_studio_ml/default_configs/model.py
@@ -8,11 +8,10 @@ class NewModel(LabelStudioMLBase):
"""

def setup(self):
"""Configure any paramaters of your model here
"""Configure any parameters of your model here
"""
self.set("model_version", "0.0.1")


def predict(self, tasks: List[Dict], context: Optional[Dict] = None, **kwargs) -> ModelResponse:
""" Write your inference logic here
:param tasks: [Label Studio tasks in JSON format](https://labelstud.io/guide/task_format.html)
@@ -29,6 +28,10 @@ def predict(self, tasks: List[Dict], context: Optional[Dict] = None, **kwargs) -
Parsed JSON Label config: {self.parsed_label_config}
Extra params: {self.extra_params}''')

# example for resource downloading from Label Studio instance,
# you need to set env vars LABEL_STUDIO_URL and LABEL_STUDIO_API_KEY
# path = self.get_local_path(tasks[0]['data']['image_url'], task_id=tasks[0]['id'])

# example for simple classification
# return [{
# "model_version": self.get("model_version"),
@@ -45,7 +48,6 @@ def predict(self, tasks: List[Dict], context: Optional[Dict] = None, **kwargs) -
# }]

return ModelResponse(predictions=[])


def fit(self, event, data, **kwargs):
"""
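The classification example that the template leaves commented out can be fleshed into a minimal runnable sketch. The control/object names (`sentiment`, `text`), the label value, and the score are placeholders that depend on your labeling config:

```python
from typing import Dict, List
import uuid

def predict_stub(tasks: List[Dict], model_version: str = "0.0.1") -> List[Dict]:
    """Minimal shape of a Label Studio prediction payload for a
    single-choice classification, one prediction per task."""
    predictions = []
    for _task in tasks:
        predictions.append({
            "model_version": model_version,
            "score": 0.12,  # placeholder confidence
            "result": [{
                "id": str(uuid.uuid4()),
                "from_name": "sentiment",  # assumed control name from your config
                "to_name": "text",         # assumed object name from your config
                "type": "choices",
                "value": {"choices": ["Negative"]},  # placeholder label
            }],
        })
    return predictions
```

In the real `predict()` these dicts would be wrapped in `ModelResponse(predictions=...)` as the template does.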
18 changes: 18 additions & 0 deletions label_studio_ml/examples/bert_classifier/.dockerignore
@@ -0,0 +1,18 @@
# Exclude everything
**

# Include Dockerfile and docker-compose for reference (optional, decide based on your use case)
!Dockerfile
!docker-compose.yml

# Include Python application files
!*.py

# Include requirements files
!requirements*.txt

# Include script
!*.sh

# Exclude specific requirements if necessary
# requirements-test.txt (Uncomment if you decide to exclude this)
6 changes: 4 additions & 2 deletions label_studio_ml/examples/bert_classifier/README.md
@@ -26,7 +26,7 @@ $ curl http://localhost:9090/
{"status":"UP"}
```

3. Connect to the backend from Label Studio running on the same host: go to your project `Settings -> Machine Learning -> Add Model` and specify `http://localhost:9090` as a URL.
3. Connect to the backend from Label Studio running on the same host: go to your project `Settings -> Model -> Connect Model` and specify `http://localhost:9090` as a URL.

> Warning! Note a current limitation of the ML backend: models are loaded dynamically from huggingface.co, so you may need the `HF_TOKEN` env variable set in your environment. This can make the first prediction request slow. If you see timeouts on the Label Studio side (i.e., no predictions are visible when opening the task), check the ML backend logs for errors and refresh the page in a few minutes.
@@ -76,7 +76,9 @@ The following parameters are available for training:
- `LEARNING_RATE`: The learning rate for the model training. Default is 2e-5.
- `NUM_TRAIN_EPOCHS`: The number of epochs for model training. Default is 3.
- `WEIGHT_DECAY`: The weight decay for the model training. Default is 0.01.
- `FINETUNED_MODEL_NAME`: The name of the fine-tuned model. Default is finetuned-model.
- `FINETUNED_MODEL_NAME`: The name of the fine-tuned model. Default is `finetuned_model`. Checkpoints will be saved under this name.

> Note: The `LABEL_STUDIO_API_KEY` is required for training the model. You can find the API key in the Label Studio instance in the `Account & Settings -> Access Token` section.

# Customization
6 changes: 3 additions & 3 deletions label_studio_ml/examples/bert_classifier/docker-compose.yml
@@ -13,12 +13,12 @@ services:
- LABEL_STUDIO_HOST=http://localhost:8080
- LABEL_STUDIO_API_KEY=your-api-key
# Use any model for [AutoModelForSequenceClassification](https://huggingface.co/transformers/v3.0.2/model_doc/auto.html#automodelforsequenceclassification)
- BASELINE_MODEL_NAME=bert-base-uncased
- BASELINE_MODEL_NAME=bert-base-multilingual-cased
# - BASELINE_MODEL_NAME=google/electra-small-discriminator
# The model directory for the fine-tuned checkpoints (relative to $MODEL_DIR)
- FINETUNED_MODEL_NAME=finetuned-model-path
- FINETUNED_MODEL_NAME=finetuned_model
# The number of labeled tasks to download from Label Studio before starting training
- START_TRAINING_EACH_N_UPDATES=1
- START_TRAINING_EACH_N_UPDATES=10
# Learning rate
- LEARNING_RATE=2e-5
# Number of epochs
3 changes: 3 additions & 0 deletions label_studio_ml/examples/bert_classifier/model.py
@@ -117,6 +117,9 @@ def predict(self, tasks: List[Dict], context: Optional[Dict] = None, **kwargs) -

def fit(self, event, data, **additional_params):
"""Download dataset from Label Studio and prepare data for training in BERT"""
if event not in ('ANNOTATION_CREATED', 'ANNOTATION_UPDATED'):
logger.info(f"Skip training: event {event} is not supported")
return
project_id = data['annotation']['project']

# download annotated tasks from Label Studio
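The early-return guard added to `fit()` above can be isolated into a small predicate: only annotation events trigger training, so other webhook events (e.g. `PROJECT_UPDATED`) are skipped instead of crashing on a missing `data['annotation']` key. A sketch:

```python
import logging

logger = logging.getLogger(__name__)

SUPPORTED_EVENTS = ('ANNOTATION_CREATED', 'ANNOTATION_UPDATED')

def should_train(event: str) -> bool:
    """Mirror the guard in fit(): train only on annotation events."""
    if event not in SUPPORTED_EVENTS:
        logger.info(f"Skip training: event {event} is not supported")
        return False
    return True
```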
18 changes: 18 additions & 0 deletions label_studio_ml/examples/easyocr/.dockerignore
@@ -0,0 +1,18 @@
# Exclude everything
**

# Include Dockerfile and docker-compose for reference (optional, decide based on your use case)
!Dockerfile
!docker-compose.yml

# Include Python application files
!*.py

# Include requirements files
!requirements*.txt

# Include script
!*.sh

# Exclude specific requirements if necessary
!label_mappings.json
8 changes: 8 additions & 0 deletions label_studio_ml/examples/easyocr/docker-compose.yml
@@ -21,6 +21,14 @@ services:
- THREADS=8
# specify the model directory (likely you don't need to change this)
- MODEL_DIR=/data/models

# Specify the Label Studio URL and API key so the backend can access
# uploaded files as well as local and cloud storage files.
# Do not use 'localhost': inside a Docker container it resolves to the container itself.
# Always prefix the URL with 'http://' or 'https://'.
# Find the host's actual IP with 'ifconfig' (Linux/Mac) or 'ipconfig' (Windows).
- LABEL_STUDIO_URL=
- LABEL_STUDIO_API_KEY=
ports:
- "9090:9090"
volumes:
14 changes: 10 additions & 4 deletions label_studio_ml/examples/easyocr/model.py
@@ -29,9 +29,14 @@ class NewModel(LabelStudioMLBase):
DEVICE = os.getenv('DEVICE', 'cuda')
# Maximum difference in box height. Boxes with very different text sizes should not be merged.
HEIGHT_THS = float(os.getenv('HEIGHT_THS', 0.8))
# Label Studio image upload folder - should be used only in case you use direct file upload into Label Studio instead of URLs
LABEL_STUDIO_ACCESS_TOKEN = os.environ.get("LABEL_STUDIO_ACCESS_TOKEN")
LABEL_STUDIO_HOST = os.environ.get("LABEL_STUDIO_HOST")
# Label Studio image upload folder:
# should be used only in case you use direct file upload into Label Studio instead of URLs
LABEL_STUDIO_ACCESS_TOKEN = (
os.environ.get("LABEL_STUDIO_ACCESS_TOKEN") or os.environ.get("LABEL_STUDIO_API_KEY")
)
LABEL_STUDIO_HOST = (
os.environ.get("LABEL_STUDIO_HOST") or os.environ.get("LABEL_STUDIO_URL")
)

MODEL_DIR = os.getenv('MODEL_DIR', '.')

@@ -98,7 +103,8 @@ def predict_single(self, task):
image_url,
cache_dir=cache_dir,
hostname=self.LABEL_STUDIO_HOST,
access_token=self.LABEL_STUDIO_ACCESS_TOKEN
access_token=self.LABEL_STUDIO_ACCESS_TOKEN,
task_id=task.get('id')
)
model_results = self.model.readtext(image_path, height_ths=self.HEIGHT_THS)
if not model_results:
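The `os.environ.get(A) or os.environ.get(B)` fallback introduced above (accepting both the legacy and the new variable names) is a small idiom worth isolating; a generalized hypothetical helper:

```python
import os

def env_first(*names: str, default: str = "") -> str:
    """Return the first non-empty environment variable among `names`.
    Matches the `get(A) or get(B)` pattern in the diff; note that `or`
    also skips variables that are set but empty."""
    for name in names:
        value = os.environ.get(name)
        if value:
            return value
    return default
```

So `env_first('LABEL_STUDIO_ACCESS_TOKEN', 'LABEL_STUDIO_API_KEY')` accepts either spelling of the token variable.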
18 changes: 18 additions & 0 deletions label_studio_ml/examples/flair/.dockerignore
@@ -0,0 +1,18 @@
# Exclude everything
**

# Include Dockerfile and docker-compose for reference (optional, decide based on your use case)
!Dockerfile
!docker-compose.yml

# Include Python application files
!*.py

# Include requirements files
!requirements*.txt

# Include script
!*.sh

# Exclude specific requirements if necessary
# requirements-test.txt (Uncomment if you decide to exclude this)
1 change: 1 addition & 0 deletions label_studio_ml/examples/flair/requirements.txt
@@ -1 +1,2 @@
scipy==1.10.1
flair==0.13.1
20 changes: 18 additions & 2 deletions label_studio_ml/examples/grounding_dino/.dockerignore
@@ -1,2 +1,18 @@
node_modules/*
docker-data/*
# Exclude everything
**

# Include Dockerfile and docker-compose for reference (optional, decide based on your use case)
!Dockerfile
!docker-compose.yml

# Include Python application files
!*.py

# Include requirements files
!requirements*.txt

# Include script
!*.sh

# Exclude specific requirements if necessary
# requirements-test.txt (Uncomment if you decide to exclude this)
32 changes: 18 additions & 14 deletions label_studio_ml/examples/grounding_dino/dino.py
@@ -6,7 +6,6 @@
from typing import List, Dict, Optional
from uuid import uuid4
from label_studio_ml.model import LabelStudioMLBase
from label_studio_ml.utils import get_image_local_path
from segment_anything.utils.transforms import ResizeLongestSide

from groundingdino.util.inference import load_model, load_image, predict, annotate
@@ -80,8 +79,12 @@ def predict_batch(

BOX_THRESHOLD = os.environ.get("BOX_THRESHOLD", 0.3)
TEXT_THRESHOLD = os.environ.get("TEXT_THRESHOLD", 0.25)
LABEL_STUDIO_ACCESS_TOKEN = os.environ.get("LABEL_STUDIO_ACCESS_TOKEN")
LABEL_STUDIO_HOST = os.environ.get("LABEL_STUDIO_HOST")
LABEL_STUDIO_ACCESS_TOKEN = (
os.environ.get("LABEL_STUDIO_ACCESS_TOKEN") or os.environ.get("LABEL_STUDIO_API_KEY")
)
LABEL_STUDIO_HOST = (
os.environ.get("LABEL_STUDIO_HOST") or os.environ.get("LABEL_STUDIO_URL")
)

USE_SAM = os.environ.get("USE_SAM", False)
USE_MOBILE_SAM = os.environ.get("USE_MOBILE_SAM", False)
@@ -170,19 +173,18 @@ def one_task(self, task):
all_points = []
all_scores = []
all_lengths = []

predictions = []


raw_img_path = task['data']['image']

try:
img_path = get_image_local_path(
img_path = self.get_local_path(
raw_img_path,
label_studio_access_token=LABEL_STUDIO_ACCESS_TOKEN,
label_studio_host=LABEL_STUDIO_HOST
ls_access_token=LABEL_STUDIO_ACCESS_TOKEN,
ls_host=LABEL_STUDIO_HOST,
task_id=task.get('id')
)
except:
except Exception as e:
logger.error(f"Error getting image path: {e}")
img_path = raw_img_path

src, img = load_image(img_path)
@@ -223,12 +225,14 @@ def multiple_tasks(self, tasks):
raw_img_path = task['data']['image']

try:
img_path = get_image_local_path(
img_path = self.get_local_path(
raw_img_path,
label_studio_access_token=LABEL_STUDIO_ACCESS_TOKEN,
label_studio_host=LABEL_STUDIO_HOST
ls_access_token=LABEL_STUDIO_ACCESS_TOKEN,
ls_host=LABEL_STUDIO_HOST,
task_id=task.get('id')
)
except:
except Exception as e:
logger.error(f"Error getting local path: {e}")
img_path = raw_img_path

image_paths.append(img_path)
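Both call sites in `dino.py` now wrap path resolution in the same try/except, logging the error instead of swallowing it with a bare `except:`. That repetition can be factored into one helper; `resolve` below is a hypothetical stand-in for `self.get_local_path`:

```python
import logging
from typing import Callable

logger = logging.getLogger(__name__)

def local_path_or_raw(resolve: Callable[[str], str], raw_img_path: str) -> str:
    """Try to resolve a task's image URL to a local file; on failure, log
    the error and fall back to the raw path, as both call sites now do."""
    try:
        return resolve(raw_img_path)
    except Exception as e:
        logger.error(f"Error getting local path: {e}")
        return raw_img_path
```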
18 changes: 18 additions & 0 deletions label_studio_ml/examples/huggingface_llm/.dockerignore
@@ -0,0 +1,18 @@
# Exclude everything
**

# Include Dockerfile and docker-compose for reference (optional, decide based on your use case)
!Dockerfile
!docker-compose.yml

# Include Python application files
!*.py

# Include requirements files
!requirements*.txt

# Include script
!*.sh

# Exclude specific requirements if necessary
# requirements-test.txt (Uncomment if you decide to exclude this)
18 changes: 18 additions & 0 deletions label_studio_ml/examples/huggingface_ner/.dockerignore
@@ -0,0 +1,18 @@
# Exclude everything
**

# Include Dockerfile and docker-compose for reference (optional, decide based on your use case)
!Dockerfile
!docker-compose.yml

# Include Python application files
!*.py

# Include requirements files
!requirements*.txt

# Include script
!*.sh

# Exclude specific requirements if necessary
# requirements-test.txt (Uncomment if you decide to exclude this)