Skip to content

Commit

Permalink
Smt clearml job (#406)
Browse files Browse the repository at this point in the history
* Initial commit

* Fix some tests

* Fix some E2E testing

* Small fix.

* different queues for GPU and CPU jobs

* Test coprusSize and confidence

* Make queue depth test more reliable.

* Update to correct image names in docker compose

* Update k8s
  • Loading branch information
johnml1135 authored Jun 11, 2024
1 parent 303c5a7 commit 920a28c
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 32 deletions.
12 changes: 8 additions & 4 deletions deploy/serval/templates/secrets.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
{{- define "secrets.clearml" }}
- name: ClearML__ApiServer
value: https://api.sil.hosted.allegro.ai
- name: ClearML__Queue
value: {{ .Values.ClearMLQueue}}
- name: ClearML__DockerImage
value: {{ .Values.ClearMLDockerImage}}
- name: ClearML__Project
value: {{ .Values.externalHost}}
- name: BuildJob__ClearML__0__Queue
value: {{ .Values.ClearMLQueue}}
- name: BuildJob__ClearML__0__DockerImage
value: {{ .Values.ClearMLDockerImage}}
- name: BuildJob__ClearML__1__Queue
value: {{ .Values.ClearMLQueue}}.cpu_only
- name: BuildJob__ClearML__1__DockerImage
value: {{ .Values.ClearMLDockerImage}}.cpu_only
- name: ClearML__AccessKey
valueFrom:
secretKeyRef:
Expand Down
13 changes: 8 additions & 5 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
version: "3"
services:
serval-api:
hostname: serval-api
Expand Down Expand Up @@ -87,11 +86,13 @@ services:
- ASPNETCORE_ConnectionStrings__Mongo=mongodb://mongo:27017/machine?replicaSet=myRS
- ASPNETCORE_ConnectionStrings__Serval=http://serval-api:81
- ClearML__ApiServer=https://api.sil.hosted.allegro.ai
- ClearML__Queue=lambert_24gb
- ClearML__DockerImage=${MACHINE_PY_IMAGE:-ghcr.io/sillsdev/machine.py:latest}
- ClearML__Project=docker-compose
- "ClearML__AccessKey=${ClearML_AccessKey:?access key needed}"
- "ClearML__SecretKey=${ClearML_SecretKey:?secret key needed}"
- BuildJob__ClearML__0__Queue=lambert_24gb
- BuildJob__ClearML__0__DockerImage=${MACHINE_PY_IMAGE:-ghcr.io/sillsdev/machine.py:latest}
- BuildJob__ClearML__1__Queue=lambert_cpu_24gb
- BuildJob__ClearML__1__DockerImage=${MACHINE_PY_CPU_IMAGE:-ghcr.io/sillsdev/machine.py:latest.cpu_only}
- SharedFile__Uri=s3://aqua-ml-data/docker-compose/
- "SharedFile__S3AccessKeyId=${AWS_ACCESS_KEY_ID:?access key needed}"
- "SharedFile__S3SecretAccessKey=${AWS_SECRET_ACCESS_KEY:?secret key needed}"
Expand Down Expand Up @@ -130,11 +131,13 @@ services:
- ASPNETCORE_Kestrel__Endpoints__Http__Url=http://*:80
- ASPNETCORE_Kestrel__EndpointDefaults__Protocols=Http2
- ClearML__ApiServer=https://api.sil.hosted.allegro.ai
- ClearML__Queue=lambert_24gb
- ClearML__DockerImage=${MACHINE_PY_IMAGE:-ghcr.io/sillsdev/machine.py:latest}
- ClearML__Project=docker-compose
- "ClearML__AccessKey=${ClearML_AccessKey:?access key needed}"
- "ClearML__SecretKey=${ClearML_SecretKey:?secret key needed}"
- BuildJob__ClearML__0__Queue=lambert_24gb
- BuildJob__ClearML__0__DockerImage=${MACHINE_PY_IMAGE:-ghcr.io/sillsdev/machine.py:latest}
- BuildJob__ClearML__1__Queue=lambert_cpu_24gb
- BuildJob__ClearML__1__DockerImage=${MACHINE_PY_CPU_IMAGE:-ghcr.io/sillsdev/machine.py:latest.cpu_only}
- SharedFile__Uri=s3://aqua-ml-data/docker-compose/
- "SharedFile__S3AccessKeyId=${AWS_ACCESS_KEY_ID:?access key needed}"
- "SharedFile__S3SecretAccessKey=${AWS_SECRET_ACCESS_KEY:?secret key needed}"
Expand Down
57 changes: 34 additions & 23 deletions tests/Serval.E2ETests/ServalApiTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,10 @@ public async Task GetSmtTranslation()
await _helperClient.AddTextCorpusToEngineAsync(engineId, books, "es", "en", false);
await _helperClient.BuildEngineAsync(engineId);
TranslationResult tResult = await _helperClient.TranslationEnginesClient.TranslateAsync(engineId, "Espíritu");
Assert.That(tResult.Translation, Is.EqualTo("spirit"));
Assert.That(tResult.Translation.Contains("spirit"));
var engine = await _helperClient.TranslationEnginesClient.GetAsync(engineId);
Assert.That(engine.Confidence, Is.GreaterThan(25));
Assert.That(engine.CorpusSize, Is.EqualTo(132));
}

[Test]
Expand Down Expand Up @@ -141,7 +144,7 @@ public async Task NmtQueueMultiple()
await Task.Delay(1_000);
}
//Wait for at least some tasks to be queued
await Task.Delay(40_000);
await Task.Delay(4_000);
string builds = "";
for (int i = 0; i < NUM_ENGINES; i++)
{
Expand All @@ -155,13 +158,32 @@ public async Task NmtQueueMultiple()
provider: CultureInfo.InvariantCulture
);

//Status message of last started build says that there is at least one job ahead of it in the queue
// (this variable due to how many jobs may already exist in the production queue from other Serval instances)
TranslationBuild newestEngineCurrentBuild = await _helperClient.TranslationEnginesClient.GetCurrentBuildAsync(
engineIds[NUM_ENGINES - 1]
);
int? queueDepth = newestEngineCurrentBuild.QueueDepth;
Queue queue = await _helperClient.TranslationEngineTypesClient.GetQueueAsync("Nmt");
int tries = 5;
for (int i = 0; i < tries; i++)
{
//Status message of last started build says that there is at least one job ahead of it in the queue
// (this variable due to how many jobs may already exist in the production queue from other Serval instances)
TranslationBuild newestEngineCurrentBuild =
await _helperClient.TranslationEnginesClient.GetCurrentBuildAsync(engineIds[NUM_ENGINES - 1]);
int? queueDepth = newestEngineCurrentBuild.QueueDepth;
Queue queue = await _helperClient.TranslationEngineTypesClient.GetQueueAsync("Nmt");
if (queueDepth is null)
{
await Task.Delay(2_000);
continue;
}
Assert.That(
queueDepth,
Is.Not.Null,
message: JsonSerializer.Serialize(newestEngineCurrentBuild) + "|||" + builds
);
Assert.Multiple(() =>
{
Assert.That(queueDepth, Is.GreaterThan(0), message: builds);
Assert.That(queue.Size, Is.GreaterThanOrEqualTo(NUM_ENGINES - NUM_WORKERS));
});
break;
}
for (int i = 0; i < NUM_ENGINES; i++)
{
try
Expand All @@ -170,16 +192,6 @@ public async Task NmtQueueMultiple()
}
catch { }
}
Assert.That(
queueDepth,
Is.Not.Null,
message: JsonSerializer.Serialize(newestEngineCurrentBuild) + "|||" + builds
);
Assert.Multiple(() =>
{
Assert.That(queueDepth, Is.GreaterThan(0), message: builds);
Assert.That(queue.Size, Is.GreaterThanOrEqualTo(NUM_ENGINES - NUM_WORKERS));
});
}

[Test]
Expand Down Expand Up @@ -306,9 +318,8 @@ public async Task CircuitousRouteTranslateTopNAsync()
"love"
);
Assert.That(
results.MaxBy(t => t.Confidences.Average())?.Translation,
Is.EqualTo("amour"),
message: "Expected best translation to be 'amour' but results were this:\n"
results.MaxBy(t => t.Confidences.Average())?.Translation.Contains("amour") ?? false,
message: "Expected best translation to contain 'amour' but results were this:\n"
+ JsonSerializer.Serialize(results)
);
}
Expand All @@ -325,7 +336,7 @@ public async Task GetSmtCancelAndRestartBuild()
// do a job normally and make sure it works.
await _helperClient.BuildEngineAsync(engineId);
TranslationResult tResult = await _helperClient.TranslationEnginesClient.TranslateAsync(engineId, "Espíritu");
Assert.That(tResult.Translation, Is.EqualTo("spirit"));
Assert.That(tResult.Translation.Contains("spirit"));
}

async Task StartAndCancelTwice(string engineId)
Expand Down

0 comments on commit 920a28c

Please sign in to comment.