-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'develop' into EA-3619_AddWorkflows
- Loading branch information
Showing
27 changed files
with
394 additions
and
109 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,35 +1,32 @@ | ||
# Pipeline executed on every push: checks out the full history, provisions
# JDK 11, restores the Maven and SonarCloud caches, then builds, tests and
# analyses the project in a single Maven invocation.
name: "Build, Run Tests and Sonar Analysis"
on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    name: Build
    steps:
      - uses: actions/checkout@v2
        with:
          # Fetch the complete history (no shallow clone) so the analysis stays relevant
          fetch-depth: 0

      - uses: actions/setup-java@v1
        name: Set up JDK 11
        with:
          java-version: 11

      - uses: actions/cache@v1
        name: Cache Maven packages
        with:
          path: ~/.m2
          # The key changes whenever any pom.xml in the repository changes
          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
          restore-keys: ${{ runner.os }}-m2

      - uses: actions/cache@v1
        name: Cache SonarCloud packages
        with:
          path: ~/.sonar/cache
          key: ${{ runner.os }}-sonar
          restore-keys: ${{ runner.os }}-sonar

      - name: Build, run tests and analyse
        env:
          # Needed to get some information about the pull request, if any
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # SonarCloud access token should be generated from https://sonarcloud.io/account/security/
          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
        # Folded scalar: the lines below are joined with single spaces into one command
        run: >-
          mvn -B verify
          org.sonarsource.scanner.maven:sonar-maven-plugin:sonar
          -Pcoverage
          -Dsonar.projectKey=europeana_translation-api
# CI workflow: on every push, builds the project with Maven on JDK 17, runs the
# tests and submits the results to SonarCloud.
name: Build, Run Tests and Sonar Analysis
on: push

jobs:
  build:
    name: Build
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          # Shallow clones should be disabled for a better relevancy of analysis
          fetch-depth: 0
      - name: Set up JDK 17
        uses: actions/setup-java@v3
        with:
          distribution: 'temurin' # should use the same as in the docker file
          java-version: 17
          # Let setup-java manage the local Maven repository cache
          cache: 'maven'
          # Derive the cache key from every module's pom.xml. The previous value
          # ('sub-project/pom.xml') was the placeholder from the setup-java
          # documentation rather than a path belonging to this project.
          cache-dependency-path: '**/pom.xml'
      - name: Cache SonarCloud packages
        uses: actions/cache@v3
        with:
          path: ~/.sonar/cache
          key: ${{ runner.os }}-sonar
          restore-keys: ${{ runner.os }}-sonar
      - name: Build, run tests and analyse
        run: mvn -B verify org.sonarsource.scanner.maven:sonar-maven-plugin:sonar -Pcoverage -Dsonar.projectKey=europeana_translation-api
        env:
          # Needed to get some information about the pull request, if any
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # SonarCloud access token should be generated from https://sonarcloud.io/account/security/
          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <!-- Module of the translation-api build; groupId and version are inherited from the parent -->
  <parent>
    <groupId>eu.europeana.api</groupId>
    <artifactId>translation-api</artifactId>
    <version>0.0.1-SNAPSHOT</version>
  </parent>
  <artifactId>translation-service-apache-tika</artifactId>
  <name>translation-service-apache-tika</name>
  <description>The Java APIs for the Apache Tika language detection services (part of Translation API)</description>

  <properties>
    <!-- Points Sonar at a JaCoCo XML report located relative to the parent directory;
         aggregate.report.xml is not defined in this file (presumably in the parent pom) -->
    <sonar.coverage.jacoco.xmlReportPaths>${basedir}/../${aggregate.report.xml}</sonar.coverage.jacoco.xmlReportPaths>
  </properties>

  <dependencies>
    <dependency>
      <groupId>eu.europeana.api</groupId>
      <artifactId>translation-service-common</artifactId>
      <!-- Sibling module: follow this module's (inherited) version instead of repeating the literal -->
      <version>${project.version}</version>
    </dependency>
    <!-- apache.tika.version is not defined in this file (presumably in the parent pom) -->
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-core</artifactId>
      <version>${apache.tika.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-langdetect-optimaize</artifactId>
      <version>${apache.tika.version}</version>
    </dependency>
  </dependencies>
</project>
105 changes: 105 additions & 0 deletions
105
...rc/main/java/eu/europeana/translation/service/apachetika/ApacheTikaLangDetectService.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
package eu.europeana.translation.service.apachetika; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Set; | ||
import org.apache.commons.lang3.StringUtils; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector; | ||
import org.apache.tika.language.detect.LanguageDetector; | ||
import org.apache.tika.language.detect.LanguageResult; | ||
import eu.europeana.api.translation.service.LanguageDetectionService; | ||
import eu.europeana.api.translation.service.exception.LanguageDetectionException; | ||
|
||
public class ApacheTikaLangDetectService implements LanguageDetectionService { | ||
|
||
protected static final Logger LOG = LogManager.getLogger(ApacheTikaLangDetectService.class); | ||
private LanguageDetector detector; | ||
private String serviceId; | ||
|
||
private Set<String> supportedLanguages = Set.of("af", "an", "ar", "ast", "be", "br", "ca", "bg", | ||
"bn", "cs", "cy", "da", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gl", "gu", "he", "hi", | ||
"hr", "ht", "hu", "id", "is", "it", "ja", "km", "kn", "ko", "lt", "lv", "mk", "ml", "mr", "ms", "mt", | ||
"ne", "nl", "no", "oc", "pa", "pl", "pt", "ro", "ru", "sk", "sl", "so", "sq", "sr", "sv", "sw", "ta", "te", "th", "tl", | ||
"tr", "uk", "ur", "vi", "wa", "yi", "zh-cn", "zh-tw"); | ||
|
||
public ApacheTikaLangDetectService() { | ||
this.detector = new OptimaizeLangDetector().loadModels(); | ||
} | ||
|
||
@Override | ||
public boolean isSupported(String srcLang) { | ||
return supportedLanguages.contains(srcLang); | ||
} | ||
|
||
@Override | ||
public List<String> detectLang(List<String> texts, String langHint) throws LanguageDetectionException { | ||
if (texts.isEmpty()) { | ||
return Collections.emptyList(); | ||
} | ||
|
||
List<String> detectedLangs = new ArrayList<>(texts.size()); | ||
List<LanguageResult> tikaLanguages=null; | ||
for(String text : texts) { | ||
//returns all tika languages sorted by score | ||
tikaLanguages = this.detector.detectAll(text); | ||
|
||
detectedLangs.add(chooseDetectedLang(tikaLanguages, langHint)); | ||
|
||
} | ||
return detectedLangs; | ||
} | ||
|
||
/** | ||
* In case lang hint is not null, check if it myabe exists among the langs with the highest confidence, | ||
* and if so return the langHint as a detected lang, if not return the first one. | ||
*/ | ||
private String chooseDetectedLang(List<LanguageResult> tikaLanguages, String langHint) { | ||
if(tikaLanguages.isEmpty()) { | ||
return null; | ||
} | ||
//if langHint is null, return the first detected language (has the highest confidence) | ||
if(StringUtils.isBlank(langHint)) { | ||
return tikaLanguages.get(0).getLanguage(); | ||
} | ||
|
||
String detectedLang=tikaLanguages.get(0).getLanguage(); | ||
if(langHint.equals(detectedLang)) { | ||
return langHint; | ||
} | ||
float confidence=tikaLanguages.get(0).getRawScore(); | ||
for(int i=1;i<tikaLanguages.size();i++) { | ||
if(tikaLanguages.get(i).getRawScore()>=confidence) { | ||
if(langHint.equals(tikaLanguages.get(i).getLanguage())) { | ||
detectedLang=langHint; | ||
break; | ||
} | ||
} else { | ||
break; | ||
} | ||
} | ||
return detectedLang; | ||
} | ||
|
||
@Override | ||
public void close() { | ||
} | ||
|
||
@Override | ||
public String getServiceId() { | ||
return serviceId; | ||
} | ||
|
||
@Override | ||
public void setServiceId(String serviceId) { | ||
this.serviceId = serviceId; | ||
} | ||
|
||
@Override | ||
public String getExternalServiceEndPoint() { | ||
return null; | ||
} | ||
|
||
} |
Oops, something went wrong.