From 94fce8d0d8fdf28819b738114bcfb1cb3e6f240f Mon Sep 17 00:00:00 2001 From: Marc Alloul Date: Wed, 13 Nov 2024 13:53:43 -0500 Subject: [PATCH 01/12] chore: initial commit of circle config --- .circleci/config.yml | 72 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..522e074 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,72 @@ +version: 2.1 +workflows: + build: + # when: << pipeline.parameters.run_delete_environment_workflow >> + jobs: + - build-docker-image: + context: + - org-global +commands: + - &configure_buildx_context + name: Configure remote docker buildx context + command: | + docker buildx create --name remote-kubernetes --driver remote --driver-opt cacert=/certs/ca.pem,cert=/certs/cert.pem,key=/certs/key.pem tcp://buildkitd-0.buildkitd-headless.circleci.svc.cluster.local:1234 + docker buildx use remote-kubernetes + + - &gcp_auth + name: "Authenticate to Google Cloud" + command: | + # Snippet from https://discuss.circleci.com/t/walk-through-oidc-to-gcp/44224 + # Configures application default credentials without requiring gcloud to be installed + GCP_OIDC_AUDIENCE="projects/${GOOGLE_PROJECT_ID}/locations/global/workloadIdentityPools/${OIDC_WIP_ID}/providers/${OIDC_WIP_PROVIDER_ID}" + GCP_IMPERSONATION_URL="https://iamcredentials.googleapis.com/v1/projects/-/serviceAccounts/${OIDC_SERVICE_ACCOUNT_EMAIL}:generateAccessToken" + + mkdir -p ~/.config/gcloud + echo "${CIRCLE_OIDC_TOKEN}" > $HOME/.config/gcloud/oidc_token + + cat >> $HOME/.config/gcloud/application_default_credentials.json \<<- EOF + { + "type": "external_account", + "audience": "//iam.googleapis.com/${GCP_OIDC_AUDIENCE}", + "subject_token_type": "urn:ietf:params:oauth:token-type:jwt", + "token_url": "https://sts.googleapis.com/v1/token", + "credential_source": { + "file": "$HOME/.config/gcloud/oidc_token" + }, + 
"service_account_impersonation_url": "${GCP_IMPERSONATION_URL}" + } + EOF + - &configure_docker_credential + name: Install docker credential helper + command: | + mkdir -p ~/bin + pushd ~/bin + curl -L https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v2.1.0/docker-credential-gcr_linux_amd64-2.1.0.tar.gz | tar zxv + popd + export PATH=$PATH:~/bin + mkdir -p $HOME/.config/gcloud/ + echo ${GOOGLE_AUTH} > $HOME/.config/gcloud/application_default_credentials.json + docker-credential-gcr configure-docker + + +jobs: + build-docker-image: + docker: + - image: docker:27-cli + # auth: *gcr_auth + resource_class: zia-ai/small + steps: + - checkout + - run: *configure_buildx_context + - run: *gcp_auth + - run: *configure_docker_credential + - run: + name: Build docker image + command: | + IMAGE_SHA1=$(echo $CIRCLE_SHA1 | cut -c -7) + IMAGE_BRANCH=$(echo $CIRCLE_BRANCH | sed -e 's/\//\-/g') + IMAGE_TAG="${CIRCLE_TAG:-$IMAGE_BRANCH}" + + docker build -t test-integration:$IMAGE_TAG . + # docker build -t $IMAGE_URL/$IMAGE_NAME:$IMAGE_TAG . 
+ # docker push $IMAGE_URL/$IMAGE_NAME:$IMAGE_TAG From 17a2a377d5fde88811b767c0753b16f6be1a3f9f Mon Sep 17 00:00:00 2001 From: Marc Alloul Date: Wed, 13 Nov 2024 13:57:30 -0500 Subject: [PATCH 02/12] fix: correct term is alises not commands --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 522e074..4129e56 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,7 +6,7 @@ workflows: - build-docker-image: context: - org-global -commands: +aliases: - &configure_buildx_context name: Configure remote docker buildx context command: | From 1936905c8edf48a1860181eb6ebd75b0e9629225 Mon Sep 17 00:00:00 2001 From: Marc Alloul Date: Wed, 13 Nov 2024 14:06:00 -0500 Subject: [PATCH 03/12] fix: docker container has no pushd preinstalled --- .circleci/config.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4129e56..f54cb60 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -39,10 +39,8 @@ aliases: - &configure_docker_credential name: Install docker credential helper command: | - mkdir -p ~/bin - pushd ~/bin - curl -L https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v2.1.0/docker-credential-gcr_linux_amd64-2.1.0.tar.gz | tar zxv - popd + mkdir -p ~/bin && cd ~/bin + curl -L https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v2.1.0/docker-credential-gcr_linux_amd64-2.1.0.tar.gz | tar zxv export PATH=$PATH:~/bin mkdir -p $HOME/.config/gcloud/ echo ${GOOGLE_AUTH} > $HOME/.config/gcloud/application_default_credentials.json From 039e2ef0a71ec07be0b24abb3b3de7d8c60c8cb6 Mon Sep 17 00:00:00 2001 From: Marc Alloul Date: Wed, 13 Nov 2024 14:11:25 -0500 Subject: [PATCH 04/12] chore: test using docker image --- .circleci/config.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 
f54cb60..8e50f11 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -39,6 +39,7 @@ aliases: - &configure_docker_credential name: Install docker credential helper command: | + apk add --no-cache curl mkdir -p ~/bin && cd ~/bin curl -L https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v2.1.0/docker-credential-gcr_linux_amd64-2.1.0.tar.gz | tar zxv export PATH=$PATH:~/bin @@ -51,6 +52,7 @@ jobs: build-docker-image: docker: - image: docker:27-cli + entrypoint: /bin/sh # auth: *gcr_auth resource_class: zia-ai/small steps: From fe6646b4af37fd3971a14ba8cddf0711eb3caa61 Mon Sep 17 00:00:00 2001 From: Marc Alloul Date: Wed, 13 Nov 2024 14:22:44 -0500 Subject: [PATCH 05/12] build: testing buildkit container --- .circleci/config.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8e50f11..064964a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -51,13 +51,13 @@ aliases: jobs: build-docker-image: docker: - - image: docker:27-cli + - image: moby/buildkit:master entrypoint: /bin/sh # auth: *gcr_auth resource_class: zia-ai/small steps: - checkout - - run: *configure_buildx_context + # - run: *configure_buildx_context - run: *gcp_auth - run: *configure_docker_credential - run: @@ -66,7 +66,13 @@ jobs: IMAGE_SHA1=$(echo $CIRCLE_SHA1 | cut -c -7) IMAGE_BRANCH=$(echo $CIRCLE_BRANCH | sed -e 's/\//\-/g') IMAGE_TAG="${CIRCLE_TAG:-$IMAGE_BRANCH}" + buildctl \ + --addr tcp://buildkitd-0.buildkitd-headless.circleci.svc.cluster.local:1234 \ + --tlscacert /certs/ca.pem \ + --tlscert /certs/cert.pem \ + --tlskey /certs/key.pem \ + build --frontend dockerfile.v0 --local context=. --local dockerfile=. --output type=image,name=test-integration:$IMAGE_TAG - docker build -t test-integration:$IMAGE_TAG . + # docker build -t test-integration:$IMAGE_TAG . # docker build -t $IMAGE_URL/$IMAGE_NAME:$IMAGE_TAG . 
# docker push $IMAGE_URL/$IMAGE_NAME:$IMAGE_TAG From 6742e5a76a478afc27d289064c4537cf710e8491 Mon Sep 17 00:00:00 2001 From: Marc Alloul Date: Wed, 13 Nov 2024 14:27:45 -0500 Subject: [PATCH 06/12] build: test pushing --- .circleci/config.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 064964a..ec0f4d7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -40,9 +40,8 @@ aliases: name: Install docker credential helper command: | apk add --no-cache curl - mkdir -p ~/bin && cd ~/bin + cd /usr/local/bin curl -L https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v2.1.0/docker-credential-gcr_linux_amd64-2.1.0.tar.gz | tar zxv - export PATH=$PATH:~/bin mkdir -p $HOME/.config/gcloud/ echo ${GOOGLE_AUTH} > $HOME/.config/gcloud/application_default_credentials.json docker-credential-gcr configure-docker @@ -66,12 +65,14 @@ jobs: IMAGE_SHA1=$(echo $CIRCLE_SHA1 | cut -c -7) IMAGE_BRANCH=$(echo $CIRCLE_BRANCH | sed -e 's/\//\-/g') IMAGE_TAG="${CIRCLE_TAG:-$IMAGE_BRANCH}" + IMAGE_NAME="clu-integration" buildctl \ --addr tcp://buildkitd-0.buildkitd-headless.circleci.svc.cluster.local:1234 \ --tlscacert /certs/ca.pem \ --tlscert /certs/cert.pem \ --tlskey /certs/key.pem \ - build --frontend dockerfile.v0 --local context=. --local dockerfile=. --output type=image,name=test-integration:$IMAGE_TAG + build --frontend dockerfile.v0 --local context=. --local dockerfile=. --output type=image,name=$IMAGE_URL/$IMAGE_NAME:$IMAGE_TAG,push=true --output type=image,name=$IMAGE_URL/$IMAGE_NAME:$IMAGE_SHA1,push=true + # docker build -t test-integration:$IMAGE_TAG . # docker build -t $IMAGE_URL/$IMAGE_NAME:$IMAGE_TAG . 
From 5e18417ce627d3f6bc98d37604693ccee6cd9186 Mon Sep 17 00:00:00 2001 From: Marc Alloul Date: Thu, 14 Nov 2024 10:00:28 -0500 Subject: [PATCH 07/12] chore: use cimg --- .circleci/config.yml | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ec0f4d7..ed1b3b5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -39,9 +39,11 @@ aliases: - &configure_docker_credential name: Install docker credential helper command: | - apk add --no-cache curl - cd /usr/local/bin - curl -L https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v2.1.0/docker-credential-gcr_linux_amd64-2.1.0.tar.gz | tar zxv + mkdir -p ~/bin + pushd ~/bin + curl -L https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v2.1.0/docker-credential-gcr_linux_amd64-2.1.0.tar.gz | tar zxv + popd + export PATH=$PATH:~/bin mkdir -p $HOME/.config/gcloud/ echo ${GOOGLE_AUTH} > $HOME/.config/gcloud/application_default_credentials.json docker-credential-gcr configure-docker @@ -50,13 +52,11 @@ aliases: jobs: build-docker-image: docker: - - image: moby/buildkit:master - entrypoint: /bin/sh - # auth: *gcr_auth + - image: cimg/base:current-22.04 resource_class: zia-ai/small steps: - checkout - # - run: *configure_buildx_context + - run: *configure_buildx_context - run: *gcp_auth - run: *configure_docker_credential - run: @@ -66,14 +66,4 @@ jobs: IMAGE_BRANCH=$(echo $CIRCLE_BRANCH | sed -e 's/\//\-/g') IMAGE_TAG="${CIRCLE_TAG:-$IMAGE_BRANCH}" IMAGE_NAME="clu-integration" - buildctl \ - --addr tcp://buildkitd-0.buildkitd-headless.circleci.svc.cluster.local:1234 \ - --tlscacert /certs/ca.pem \ - --tlscert /certs/cert.pem \ - --tlskey /certs/key.pem \ - build --frontend dockerfile.v0 --local context=. --local dockerfile=. 
--output type=image,name=$IMAGE_URL/$IMAGE_NAME:$IMAGE_TAG,push=true --output type=image,name=$IMAGE_URL/$IMAGE_NAME:$IMAGE_SHA1,push=true - - - # docker build -t test-integration:$IMAGE_TAG . - # docker build -t $IMAGE_URL/$IMAGE_NAME:$IMAGE_TAG . - # docker push $IMAGE_URL/$IMAGE_NAME:$IMAGE_TAG + docker buildx build -t $IMAGE_URL/$IMAGE_NAME:$IMAGE_TAG -t $IMAGE_URL/$IMAGE_NAME:$IMAGE_SHA1 --push . From 2613760a1ae3d6f8a0c778ecd7255271839add00 Mon Sep 17 00:00:00 2001 From: Marc Alloul Date: Thu, 14 Nov 2024 10:21:37 -0500 Subject: [PATCH 08/12] chore: copy current working dir --- .circleci/config.yml | 2 +- .dockerignore | 27 +++++++++++++++++++++++++++ Dockerfile | 5 +++-- 3 files changed, 31 insertions(+), 3 deletions(-) create mode 100644 .dockerignore diff --git a/.circleci/config.yml b/.circleci/config.yml index ed1b3b5..be5f576 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -66,4 +66,4 @@ jobs: IMAGE_BRANCH=$(echo $CIRCLE_BRANCH | sed -e 's/\//\-/g') IMAGE_TAG="${CIRCLE_TAG:-$IMAGE_BRANCH}" IMAGE_NAME="clu-integration" - docker buildx build -t $IMAGE_URL/$IMAGE_NAME:$IMAGE_TAG -t $IMAGE_URL/$IMAGE_NAME:$IMAGE_SHA1 --push . + docker buildx build -t $IMAGE_URL/$IMAGE_NAME:$IMAGE_TAG -t $IMAGE_URL/$IMAGE_NAME:$IMAGE_SHA1 --progress plain --push . diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..a3b2c1f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,27 @@ + +.git +.github +.dockerignore +.gitignore + +.idea +.vscode + +__pycache__/ +*.py[cod] +*$py.class +*.so +htmlcov/ +.coverage +.coverage.* +.pytest_cache/ +.venv +venv + +.DS_Store +.AppleDouble +.LSOverride +._* + +.vscode +.idea diff --git a/Dockerfile b/Dockerfile index a1db112..17e78e6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,9 +11,10 @@ WORKDIR /src # RUN git clone -b --single-branch https://github.com/zia-ai/hf-custom-integration.git . # Clone from master branch -RUN git clone https://github.com/zia-ai/hf-custom-integration.git . 
+# RUN git clone https://github.com/zia-ai/hf-custom-integration.git . +COPY . /src # Generate MTLS credentials from the commands given in the README.md -COPY ./credentials /src/credentials +# COPY ./credentials /src/credentials RUN poetry config virtualenvs.create false && poetry install From b5d9b04d27cf1d3749633f0b5db6655900eb3758 Mon Sep 17 00:00:00 2001 From: Marc Alloul Date: Thu, 14 Nov 2024 15:23:55 -0500 Subject: [PATCH 09/12] chore: only build when version tag pushed --- .circleci/config.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index be5f576..4c1f573 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,11 +1,17 @@ version: 2.1 workflows: build: - # when: << pipeline.parameters.run_delete_environment_workflow >> jobs: - build-docker-image: context: - org-global + filters: + tags: + only: /^v.*/ + branches: + ignore: /.*/ + + aliases: - &configure_buildx_context name: Configure remote docker buildx context From 7d993bc2248456502f05c470ee0607d0fa24b6a9 Mon Sep 17 00:00:00 2001 From: Mohammed Fayaz Ansar Jelani Date: Tue, 26 Nov 2024 05:17:10 +0000 Subject: [PATCH 10/12] delimiter fix --- README.md | 16 +++++----------- hf_integration/clu_converters.py | 22 +++++++++++++++++----- hf_integration/model_clu.py | 16 ++++++++++++---- hf_integration/workspace_clu.py | 32 ++++++++++++++++++++++++++++---- 4 files changed, 62 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index d2071fa..71dc4f0 100644 --- a/README.md +++ b/README.md @@ -121,20 +121,14 @@ Example command installing CLI-1.35.0 7. Get the Azure endpoint and API key using https://portal.azure.com/ Go to resource Find Endpoint and keys - -8. Set HumanFirst environment variables - ``` - export HF_USERNAME="" - export HF_PASSWORD="" - ``` -9. Set environment variables for running CLU integration +8. 
Set environment variables for running CLU integration ``` export CLU_ENDPOINT="" export CLU_KEY="" ``` **Note: In case of restarting the instance, ensure to run the follwoing command again - `sudo sysctl -w net.ipv4.ip_unprivileged_port_start=443`** -10. Launch the integration service: +9. Launch the integration service: ``` poetry run python3 -m hf_integration.main ./credentials/mtls-credentials.json 0.0.0.0:443 "" ``` @@ -144,7 +138,7 @@ Example command installing CLI-1.35.0 poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,project_path::/home/FayazJelani/hf-custom-integration,clu_language::ja,clu_multilingual::True,clu_training_mode::advanced,log_level::debug" ``` -11. IF the IP address of the integration server changes, then use the following command to set the IP address of the integration server in the HF +10. IF the IP address of the integration server changes, then use the following command to set the IP address of the integration server in the HF `hf integrations --id intg-id-here set-address -a :443` ## Docker @@ -208,14 +202,14 @@ Follow the steps here - https://www.notion.so/humanfirst/Custom-NLU-d4bb84f08676 ### Create, attach and run the commands manually ``` -sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -e "HF_USERNAME=$HF_USERNAME" -e "HF_PASSWORD=$HF_PASSWORD" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector tail -f /dev/null +sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector tail -f /dev/null sudo docker exec -it clu-custom-connector-0 /bin/bash poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500" ### Run the commands 
while creating the container -sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -e "HF_USERNAME=$HF_USERNAME" -e "HF_PASSWORD=$HF_PASSWORD" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500" +sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500" ### Free up the port 443 sudo kill -9 $(sudo lsof -t -i :443) diff --git a/hf_integration/clu_converters.py b/hf_integration/clu_converters.py index 44d2bf2..4628674 100644 --- a/hf_integration/clu_converters.py +++ b/hf_integration/clu_converters.py @@ -137,7 +137,7 @@ class clu_to_hf_converter: def clu_to_hf_process( self, clu_json: dict, - delimiter: str = "-", + delimiter: str, language: str = "en-us") -> None: # TODO: note potential clashes with utf16 and utf8 in future depending on PVA @@ -299,7 +299,10 @@ def clu_to_hf_entity_mapper(self, clu_entity_object: dict, language: str) -> dic def clu_to_hf_intent_mapper(self, intent_name: str, hf_workspace: humanfirst.objects.HFWorkspace, delimiter: str) -> None: """Builds the parent and child structures for an intent name""" # clu doesn't have separate IDs (current understanding) - intent_hierarchy = intent_name.split(delimiter) + if delimiter != "": + intent_hierarchy = intent_name.split(delimiter) + else: + intent_hierarchy = intent_name hf_workspace.intent(intent_hierarchy) def clu_to_hf_utterance_mapper(self, @@ -309,7 +312,12 @@ def 
clu_to_hf_utterance_mapper(self, delimiter: str) -> None: """Builds HF example""" fully_qualified_intent_name = str(row["intent"]) - intent_hierarchy = fully_qualified_intent_name.split(delimiter) + + if delimiter != "": + intent_hierarchy = fully_qualified_intent_name.split(delimiter) + else: + intent_hierarchy = fully_qualified_intent_name + try: tag_name = row["dataset"] if pandas.isna(tag_name): @@ -327,8 +335,8 @@ class hf_to_clu_converter: def hf_to_clu_process(self, hf_json: dict, clu_json: dict, + delimiter: str, language: str = "en-us", - delimiter: str = "-", skip: bool = False) -> None: """Process HF to CLU conversion""" @@ -337,7 +345,11 @@ def hf_to_clu_process(self, # get a HFWorkspace object to get fully qualified intent names # logger.info("delimiter blah blah") logger.info(f"Delimiter {delimiter}") - hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter) + + if delimiter != "": + hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter=delimiter) + else: + hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json,delimiter=None) # get the tag for Test dataset test_tag_id = None diff --git a/hf_integration/model_clu.py b/hf_integration/model_clu.py index 96b0e8a..e90f8a6 100644 --- a/hf_integration/model_clu.py +++ b/hf_integration/model_clu.py @@ -174,6 +174,11 @@ def __init__(self, config: dict) -> None: if self.config["max_batch_size"] <= 0: raise RuntimeError(f'Max Batch Size cannot be less than or qual to 0') + # check for delimiter + if "delimiter" in self.config: + if self.config["delimiter"] != "": + self.format_options.hierarchical_delimiter=self.config["delimiter"] + def _flip_dict(self, input_dict, delimiter): # Ensure that all values in the original dictionary are unique @@ -280,7 +285,9 @@ def on_cancel(): namespace=request.namespace, integration_id=request.integration_id, data=request.data, - workspace_id=project_name + workspace_id=project_name, + data_format=self.data_format, + 
format_options=self.format_options ) hf_file_path = os.path.join(self.snapshot_path, "import", f"{timestamp}_hf_{request.namespace}_{project_name}.json") @@ -449,10 +456,11 @@ async def _Classify(self, request: models_pb2.ClassifyRequest, context) -> model with open(self.handle_map[request.model_id]["hf_file_path"], mode="r", encoding="utf8") as f: hf_json = json.load(f) - hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json, self.config["delimiter"]) + hf_workspace = humanfirst.objects.HFWorkspace.from_json(hf_json, + self.format_options.hierarchical_delimiter) intent_index = self._flip_dict(hf_workspace.get_intent_index( - delimiter=self.config["delimiter"]), - delimiter=self.config["delimiter"] + delimiter=self.format_options.hierarchical_delimiter), + delimiter=self.format_options.hierarchical_delimiter ) predictions = [] predict_results = [] diff --git a/hf_integration/workspace_clu.py b/hf_integration/workspace_clu.py index d22911e..c30e44f 100644 --- a/hf_integration/workspace_clu.py +++ b/hf_integration/workspace_clu.py @@ -147,6 +147,12 @@ def __init__(self, config: dict) -> None: self.multilingual = self.multilingual = {"True": True, "False": False}[self.config["clu_multilingual"]] + # check for delimiter + if "delimiter" in self.config: + if self.config["delimiter"] != "": + self.format_options.hierarchical_delimiter=self.config["delimiter"] + + def _write_json(self,path: str, data: dict ) -> None: with open(path,mode="w",encoding="utf8") as f: json.dump(data,f,indent=2) @@ -154,6 +160,9 @@ def _write_json(self,path: str, data: dict ) -> None: def ListWorkspaces(self, request: workspace_pb2.ListWorkspacesRequest, context) -> workspace_pb2.ListWorkspacesResponse: """List Workspaces""" + print("ListWorkspaces") + print(request) + workspaces = [] for project in self.clu_api.list_projects(): workspaces.append(workspace_pb2.Workspace(id=project, name=project)) @@ -163,6 +172,10 @@ def ListWorkspaces(self, request: 
workspace_pb2.ListWorkspacesRequest, context) def GetWorkspace(self, request: workspace_pb2.GetWorkspaceRequest, context) -> workspace_pb2.Workspace: """Get workspace""" + print("GetWorkspace") + + print(request) + if request.workspace_id in self.clu_api.list_projects(): return workspace_pb2.Workspace(id=request.workspace_id, name=request.workspace_id) else: @@ -172,6 +185,8 @@ def CreateWorkspace(self, request: workspace_pb2.CreateWorkspaceRequest, context """ Create a new workspace """ + print("CreateWorkspace") + print(request) self.clu_api.clu_create_project(project_name=request.workspace.name, des = request.workspace.description, language=self.language, @@ -185,14 +200,19 @@ def GetImportParameters(self, request: workspace_pb2.GetImportParametersRequest, In this case, we specifically request the HF json format """ - + print("GetImportParameters") + print(request) + # print(request.language_code) return workspace_pb2.GetImportParametersResponse(data_format=self.data_format, format_options=self.format_options) def ImportWorkspace(self, request: workspace_pb2.ImportWorkspaceRequest, context) -> workspace_pb2.ImportWorkspaceResponse: """ Import a workspace into the integration, from the provided data exported from Studio """ - + print("ImportWorkspace") + print(request) + print(f"Hierarchical Delimiter: {request.format_options.hierarchical_delimiter}") + # print(request.language_code) # Get the current timestamp timestamp = datetime.now().strftime("%Y%m%d%H%M%S") project_name = self.clu_api._remove_non_alphanumeric(input_string=request.workspace_id) @@ -241,7 +261,7 @@ def ImportWorkspace(self, request: workspace_pb2.ImportWorkspaceRequest, context clu_json = self.clu_converter.hf_to_clu_process( hf_json=hf_json, clu_json=clu_json, - delimiter=self.config["delimiter"], + delimiter=self.format_options.hierarchical_delimiter, language=self.language) self._write_json( @@ -258,6 +278,10 @@ def ExportWorkspace(self, request: workspace_pb2.ExportWorkspaceRequest, 
context Exports a workspace from the integration, importing it into Studio """ + print("ExportWorkspace") + print(request) + # print(request.language_code) + # Get the current timestamp timestamp = datetime.now().strftime("%Y%m%d%H%M%S") @@ -268,7 +292,7 @@ def ExportWorkspace(self, request: workspace_pb2.ExportWorkspaceRequest, context hf_json = self.clu_converter.clu_to_hf_process( clu_json=clu_project, - delimiter=self.config["delimiter"], + delimiter=self.format_options.hierarchical_delimiter, language=self.language) self._write_json( From 2e2a154d1df0e0445e42dfbc32e72a3e6b69dc69 Mon Sep 17 00:00:00 2001 From: Mohammed Fayaz Ansar Jelani Date: Wed, 11 Dec 2024 07:37:58 +0000 Subject: [PATCH 11/12] training delimiter --- README.md | 12 +++++++++++- hf_integration/model_clu.py | 24 ++++++++++++++++++------ hf_integration/workspace_clu.py | 6 +++--- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 71dc4f0..99e1f42 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,17 @@ Example command installing CLI-1.35.0 Example: ``` - poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,project_path::/home/FayazJelani/hf-custom-integration,clu_language::ja,clu_multilingual::True,clu_training_mode::advanced,log_level::debug" + poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500,training_delimiter::---" + + CLU config params: + clu_endpoint + clu_key + training_delimiter + workspace_delimiter + clu_language + clu_multilingual + clu_training_mode + max_batch_size ``` 10. 
IF the IP address of the integration server changes, then use the following command to set the IP address of the integration server in the HF diff --git a/hf_integration/model_clu.py b/hf_integration/model_clu.py index e90f8a6..419aeb0 100644 --- a/hf_integration/model_clu.py +++ b/hf_integration/model_clu.py @@ -32,6 +32,7 @@ TRAIN_SPLIT=100 MAX_BATCH_SIZE=1000 +TRAINING_DELIMITER = "---" CLU_SUPPORTED_LANGUAGE_CODES = [ "af", "am", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ca", "cs", "cy", "da", "de", "el", "en-us", "en-gb", "eo", "es", "et", "eu", "fa", @@ -147,7 +148,6 @@ def __init__(self, config: dict) -> None: super().__init__(config) self.clu_api = clu_apis(clu_endpoint=self.config["clu_endpoint"], clu_key=self.config["clu_key"]) - self.workspace = WorkspaceServiceCLU(config=config) # Check for language code support if self.config["clu_language"] in CLU_SUPPORTED_LANGUAGE_CODES: @@ -175,10 +175,18 @@ def __init__(self, config: dict) -> None: raise RuntimeError(f'Max Batch Size cannot be less than or qual to 0') # check for delimiter - if "delimiter" in self.config: - if self.config["delimiter"] != "": - self.format_options.hierarchical_delimiter=self.config["delimiter"] + if "training_delimiter" in self.config: + if self.config["training_delimiter"] != "": + self.format_options.hierarchical_delimiter=self.config["training_delimiter"] + self.config["workspace_delimiter"] = self.config["training_delimiter"] + else: + self.format_options.hierarchical_delimiter = TRAINING_DELIMITER + self.config["workspace_delimiter"] = TRAINING_DELIMITER + else: + self.format_options.hierarchical_delimiter = TRAINING_DELIMITER + self.config["workspace_delimiter"] = TRAINING_DELIMITER + self.workspace = WorkspaceServiceCLU(config=config) def _flip_dict(self, input_dict, delimiter): # Ensure that all values in the original dictionary are unique @@ -187,8 +195,12 @@ def _flip_dict(self, input_dict, delimiter): # Flip the dictionary flipped_dict = {} - for key, value in 
input_dict.items(): - flipped_dict[value] = [key, value.split(delimiter)[-1]] + if delimiter != "": + for key, value in input_dict.items(): + flipped_dict[value] = [key, value.split(delimiter)[-1]] + else: + for key, value in input_dict.items(): + flipped_dict[value] = [key, value] return flipped_dict diff --git a/hf_integration/workspace_clu.py b/hf_integration/workspace_clu.py index c30e44f..ebecb54 100644 --- a/hf_integration/workspace_clu.py +++ b/hf_integration/workspace_clu.py @@ -148,9 +148,9 @@ def __init__(self, config: dict) -> None: self.multilingual = self.multilingual = {"True": True, "False": False}[self.config["clu_multilingual"]] # check for delimiter - if "delimiter" in self.config: - if self.config["delimiter"] != "": - self.format_options.hierarchical_delimiter=self.config["delimiter"] + if "workspace_delimiter" in self.config: + if self.config["workspace_delimiter"] != "": + self.format_options.hierarchical_delimiter=self.config["workspace_delimiter"] def _write_json(self,path: str, data: dict ) -> None: From a2f34435ef540851b79647be323af933504e5b76 Mon Sep 17 00:00:00 2001 From: Mohammed Fayaz Ansar Jelani Date: Mon, 16 Dec 2024 13:49:28 +0000 Subject: [PATCH 12/12] update readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 99e1f42..7235d6a 100644 --- a/README.md +++ b/README.md @@ -216,10 +216,10 @@ sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -d --name sudo docker exec -it clu-custom-connector-0 /bin/bash -poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500" +poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu 
"clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500" ### Run the commands while creating the container -sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,delimiter::-,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500" +sudo docker run -e "CLU_KEY=$CLU_KEY" -e "CLU_ENDPOINT=$CLU_ENDPOINT" -d --name clu-custom-connector-0 -p 443:443 clu-custom-connector poetry run python3 -m hf_integration.main ./credentials/my_org-mtls-credentials.json 0.0.0.0:443 clu "clu_endpoint::$CLU_ENDPOINT,clu_key::$CLU_KEY,clu_language::en-us,clu_multilingual::True,clu_training_mode::standard,max_batch_size::500" ### Free up the port 443 sudo kill -9 $(sudo lsof -t -i :443)