Skip to content

Commit

Permalink
small refactoring of the apache tika lang detect
Browse files Browse the repository at this point in the history
  • Loading branch information
SrdjanStevanetic committed Nov 29, 2023
1 parent 6184f06 commit 1c27732
Showing 1 changed file with 11 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -61,33 +61,30 @@ public List<String> detectLang(List<String> texts, String langHint) throws Langu
for(String text : texts) {
//returns all tika languages sorted by score
tikaLanguages = this.detector.detectAll(text);
if(tikaLanguages.isEmpty()) {
detectedLangs.add(null);
continue;
}
//if langHint is null, return the first detected language (has the highest confidence)
if(StringUtils.isBlank(langHint)) {
detectedLangs.add(tikaLanguages.get(0).getLanguage());
continue;
}

detectedLangs.add(getDetectedLangByHint(tikaLanguages, langHint));
detectedLangs.add(chooseDetectedLang(tikaLanguages, langHint));

}
return detectedLangs;
}

/*
/**
* In case the lang hint is not blank, check whether it exists among the langs with the highest confidence,
* and if so return the langHint as the detected lang; if not, return the first one.
* The lang hint param cannot be null.
*/
private String getDetectedLangByHint(List<LanguageResult> tikaLanguages, String langHint) {
private String chooseDetectedLang(List<LanguageResult> tikaLanguages, String langHint) {
if(tikaLanguages.isEmpty()) {
return null;
}
//if langHint is blank, return the first detected language (has the highest confidence)
if(StringUtils.isBlank(langHint)) {
return tikaLanguages.get(0).getLanguage();
}

String detectedLang=tikaLanguages.get(0).getLanguage();
if(langHint.equals(detectedLang)) {
return langHint;
}

float confidence=tikaLanguages.get(0).getRawScore();
for(int i=1;i<tikaLanguages.size();i++) {
if(tikaLanguages.get(i).getRawScore()>=confidence) {
Expand Down

0 comments on commit 1c27732

Please sign in to comment.