diff --git a/docker/Dockerfile b/docker/Dockerfile
index 449fb913..60b6ed14 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -2,65 +2,34 @@
 FROM continuumio/miniconda3
 MAINTAINER Gilles Bodart <gillesbodart@users.noreply.github.com>
 
-RUN conda create -n env python=3.6
-RUN echo "source activate env" > ~/.bashrc
-ENV PATH /opt/conda/envs/env/bin:$PATH
+# Define the argument for language
+ARG lang
 
-RUN apt-get -qq -y update
-RUN apt-get -qq -y upgrade
-RUN apt-get -qq -y install \
-    gcc \
-    g++ \
-    wget \
-    curl \
-    git \
-    make \
-    unzip \
-    sudo \
-    vim
+# Install build-essential (compiler and development tools)
+RUN apt-get update && \
+    apt-get install -y build-essential && \
+    rm -rf /var/lib/apt/lists/*
 
-# Use C.UTF-8 locale to avoid issues with ASCII encoding
-ENV LC_ALL=C.UTF-8
-ENV LANG=C.UTF-8
+RUN conda create -n env python=3.8
+RUN echo "source activate env" > ~/.bashrc
+ENV PATH /opt/conda/envs/env/bin:$PATH
 
 # Set the working directory to /app
 WORKDIR /app
 
-COPY ./requirements.txt /app/requirements.txt
-
-# Install any needed packages specified in requirements.txt
-RUN pip install --trusted-host pypi.python.org -r requirements.txt --verbose
-
-
-# Download LASER from FB
-RUN git clone https://github.com/facebookresearch/LASER.git
-
-ENV LASER /app/LASER
-WORKDIR $LASER
-
-RUN bash ./install_models.sh
-
-
-#Installing FAISS
-
-RUN conda install --name env -c pytorch faiss-cpu -y
-
-RUN bash ./install_external_tools.sh
-
-COPY ./decode.py $LASER/tasks/embed/decode.py
-
-
-# Make port 80 available to the world outside this container
-WORKDIR /app
+# Copy the local laser-encoders repository
+COPY laser_encoders /app/laser_encoders
+COPY pyproject.toml /app/
 
-RUN echo "Hello World" > test.txt
+RUN pip install --upgrade pip
+RUN pip install . --verbose
 
-RUN $LASER/tasks/embed/embed.sh test.txt en test_embed.raw
-RUN python $LASER/tasks/embed/decode.py test_embed.raw
+# Download language models based on the specified language
+RUN python -m laser_encoders.download_models --lang=$lang
 
-#Open the port 80
+# Open the port 80
 EXPOSE 80
 
-COPY ./app.py /app/app.py
+COPY docker/app.py /app/app.py
 
 CMD ["/bin/bash"]
diff --git a/docker/README.md b/docker/README.md
index 57e18c82..41da3142 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -1,19 +1,62 @@
-## Docker
+## LASER Docker Image
 
-An image docker has been created to help you with the settings of an environment here are the step to follow :
+This image provides a convenient way to run LASER in a Docker container.
+To build the image, run the following command from the root of the LASER directory:
 
-* Open a command prompt on the root of your LASER project
-* Execute the command `docker build --tag=laser docker`
-* Once the image is built run `docker run -it laser`
+```
+docker build --tag=laser docker
+```
+Once the image is built, you can run it with the following command:
 
-A REST server on top of the embed task is under developement,
-to run it you'll have to expose a local port [CHANGEME_LOCAL_PORT] by executing the next line instead of the last command. It'll overinde the command line entrypoint of your docker container.
+```
+docker run -it laser
+```
+**Note:** If you want to expose a local port to the REST server on top of the embed task, you can do so by executing the following command instead of the last command:
 
-* `docker run -p [CHANGEME_LOCAL_PORT]:80 -it laser python app.py`
+```
+docker run -it -p [CHANGEME_LOCAL_PORT]:80 laser python app.py
+```
+This will override the command line entrypoint of the Docker container.
+
+Example:
+
+```
+docker run -it -p 8081:80 laser python app.py
+```
 
 This Flask server will serve a REST Api that can be use by calling your server with this URL :
 
-* http://127.0.0.1:[CHANGEME_LOCAL_PORT]/vectorize?q=[YOUR_SENTENCE_URL_ENCODED]&lang=[LANGUAGE]
+```
+http://127.0.0.1:[CHANGEME_LOCAL_PORT]/vectorize?q=[YOUR_SENTENCE_URL_ENCODED]&lang=[LANGUAGE]
+```
+
+Example:
+
+```
+http://127.0.0.1:8081/vectorize?q=ki%20lo%20'orukọ%20ẹ&lang=yor
+```
+
+Sample response:
+```
+{
+  "content": "ki lo 'orukọ ẹ",
+  "embedding": [
+    [
+      -0.10241681337356567,
+      0.11120740324258804,
+      -0.26641348004341125,
+      -0.055699944496154785,
+      ....
+      ....
+      ....
+      -0.034048307687044144,
+      0.11005636304616928,
+      -0.3238321840763092,
+      -0.060631975531578064,
+      -0.19269055128097534,
+    ]
+}
+```
 
 Here is an example of how you can send requests to it with python:
 
diff --git a/docker/app.py b/docker/app.py
index a5574b9a..cbb85a41 100644
--- a/docker/app.py
+++ b/docker/app.py
@@ -1,78 +1,44 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-from flask import Flask, request, jsonify
 import os
 import socket
-import tempfile
-from pathlib import Path
+
 import numpy as np
-from LASER.source.lib.text_processing import Token, BPEfastApply
-from LASER.source.embed import *
+from flask import Flask, jsonify, request
+from laser_encoders import LaserEncoderPipeline
 
 app = Flask(__name__)
-app.config['JSON_AS_ASCII'] = False
 
 
 @app.route("/")
 def root():
     print("/")
-    html = "<h3>Hello {name}!</h3>" \
-           "<b>Hostname:</b> {hostname}<br/>"
+    html = "<h3>Hello {name}!</h3>" "<b>Hostname:</b> {hostname}<br/>"
     return html.format(name=os.getenv("LASER", "world"), hostname=socket.gethostname())
 
 
-@app.route("/vectorize")
+@app.route("/vectorize", methods=["GET"])
 def vectorize():
-    content = request.args.get('q')
-    lang = request.args.get('lang')
-    embedding = ''
-    if lang is None or not lang:
-        lang = "en"
-    # encoder
-    model_dir = Path(__file__).parent / "LASER" / "models"
-    encoder_path = model_dir / "bilstm.93langs.2018-12-26.pt"
-    bpe_codes_path = model_dir / "93langs.fcodes"
-    print(f' - Encoder: loading {encoder_path}')
-    encoder = SentenceEncoder(encoder_path,
-                              max_sentences=None,
-                              max_tokens=12000,
-                              sort_kind='mergesort',
-                              cpu=True)
-    with tempfile.TemporaryDirectory() as tmp:
-        tmpdir = Path(tmp)
-        ifname = tmpdir / "content.txt"
-        bpe_fname = tmpdir / 'bpe'
-        bpe_oname = tmpdir / 'out.raw'
-        with ifname.open("w") as f:
-            f.write(content)
-        if lang != '--':
-            tok_fname = tmpdir / "tok"
-            Token(str(ifname),
-                  str(tok_fname),
-                  lang=lang,
-                  romanize=True if lang == 'el' else False,
-                  lower_case=True,
-                  gzip=False,
-                  verbose=True,
-                  over_write=False)
-            ifname = tok_fname
-        BPEfastApply(str(ifname),
-                     str(bpe_fname),
-                     str(bpe_codes_path),
-                     verbose=True, over_write=False)
-        ifname = bpe_fname
-        EncodeFile(encoder,
-                   str(ifname),
-                   str(bpe_oname),
-                   verbose=True,
-                   over_write=False,
-                   buffer_size=10000)
-        dim = 1024
-        X = np.fromfile(str(bpe_oname), dtype=np.float32, count=-1)
-        X.resize(X.shape[0] // dim, dim)
-        embedding = X
-    body = {'content': content, 'embedding': embedding.tolist()}
-    return jsonify(body)
+    content = request.args.get("q")
+    lang = request.args.get("lang", "en")  # Default to English if 'lang' is not provided
+
+    if content is None:
+        return jsonify({"error": "Missing input content"}), 400
+
+    try:
+        encoder = LaserEncoderPipeline(lang=lang)
+        embeddings = encoder.encode_sentences([content])
+        embeddings_list = embeddings.tolist()
+        body = {"content": content, "embedding": embeddings_list}
+        return jsonify(body), 200
+
+    except ValueError as e:
+        # Check if the exception is due to an unsupported language
+        if "unsupported language" in str(e).lower():
+            return jsonify({"error": f"Language '{lang}' is not supported."}), 400
+        else:
+            return jsonify({"error": str(e)}), 400
+
 
 if __name__ == "__main__":
-    app.run(debug=True, port=80, host='0.0.0.0')
+    app.run(debug=True, port=80, host="0.0.0.0")
diff --git a/docker/requirements.txt b/docker/requirements.txt
index 2b38074b..21424171 100644
--- a/docker/requirements.txt
+++ b/docker/requirements.txt
@@ -1,6 +1,14 @@
-Flask
-scipy
-numpy
-Cython
+fairseq==0.12.2
+numpy==1.25.0
+pytest==7.4.0
+Requests==2.31.0
+sacremoses==0.0.53
+sentencepiece==0.1.99
+tqdm==4.65.0
+Flask==2.3.3
+
+--extra-index-url https://download.pytorch.org/whl/cpu
 torch
-transliterate
\ No newline at end of file
+
+--extra-index-url https://test.pypi.org/simple/
+laser-encoders==0.0.3