From 1df98413c2f0acccb9dc72cad49a7eb71574f56b Mon Sep 17 00:00:00 2001 From: Antoni Casas Date: Sun, 1 Dec 2019 17:28:41 +0100 Subject: [PATCH 1/8] First steps to adding improved semantic extraction --- libs/linguistic/rake/1.0/LICENSE.txt | 21 + libs/linguistic/rake/1.0/rake-1.0.jar | Bin 0 -> 5716 bytes libs/linguistic/rake/1.0/rake-1.0.pom.xml | 10 + pom.xml | 20 + .../controller/Controller.java | 3 + .../domain/Requirement.java | 27 + .../functionalities/NLPAnalyser.java | 375 +----- .../functionalities/OntologyHandler.java | 91 +- .../functionalities/RAKEKeywordExtractor.java | 137 +++ .../TFIDFKeywordExtractor.java | 188 +++ .../functionalities/TextPreprocessing.java | 62 + .../service/DependencyService.java | 19 +- src/main/resources/ExcludedWords.txt | 1075 +++++++++++++++++ .../wordnet/wordnet_properties.xml | 112 +- .../resources.xml | 2 +- .../gessi/dependency_detection/AppTest.java | 2 +- 16 files changed, 1711 insertions(+), 433 deletions(-) create mode 100644 libs/linguistic/rake/1.0/LICENSE.txt create mode 100644 libs/linguistic/rake/1.0/rake-1.0.jar create mode 100644 libs/linguistic/rake/1.0/rake-1.0.pom.xml create mode 100644 src/main/java/com/gessi/dependency_detection/domain/Requirement.java create mode 100644 src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java create mode 100644 src/main/java/com/gessi/dependency_detection/functionalities/TFIDFKeywordExtractor.java create mode 100644 src/main/java/com/gessi/dependency_detection/functionalities/TextPreprocessing.java create mode 100644 src/main/resources/ExcludedWords.txt diff --git a/libs/linguistic/rake/1.0/LICENSE.txt b/libs/linguistic/rake/1.0/LICENSE.txt new file mode 100644 index 0000000..75dd38b --- /dev/null +++ b/libs/linguistic/rake/1.0/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Linguistic + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file
diff --git a/libs/linguistic/rake/1.0/rake-1.0.jar b/libs/linguistic/rake/1.0/rake-1.0.jar
new file mode 100644
index 0000000000000000000000000000000000000000..9c1f64bfcfbd2ce20c78df271439573caa300028
GIT binary patch
literal 5716
[5716 bytes of base85-encoded binary data omitted: the prebuilt rake-1.0.jar]
diff --git a/libs/linguistic/rake/1.0/rake-1.0.pom.xml b/libs/linguistic/rake/1.0/rake-1.0.pom.xml
new file mode 100644
index 0000000..66a3a9a
--- /dev/null
+++ b/libs/linguistic/rake/1.0/rake-1.0.pom.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>linguistic</groupId>
+  <artifactId>rake</artifactId>
+  <version>1.0</version>
+  <description>POM was created from install:install-file</description>
+</project>
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index ed9c673..29a00fd 100644
--- a/pom.xml
+++ b/pom.xml
@@ -53,6 +53,26 @@
 			<artifactId>spring-boot-starter-log4j2</artifactId>
 		</dependency>
 
+		<dependency>
+			<groupId>org.apache.lucene</groupId>
+			<artifactId>lucene-analyzers-common</artifactId>
+			<version>7.7.1</version>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.lucene</groupId>
+			<artifactId>lucene-core</artifactId>
+			<version>7.7.1</version>
+		</dependency>
+		<dependency>
+			<groupId>linguistic</groupId>
+			<artifactId>rake</artifactId>
+			<version>1.0</version>
+			<scope>system</scope>
+			<systemPath>${project.basedir}/libs/linguistic/rake/1.0/rake-1.0.jar</systemPath>
+		</dependency>
+
+
diff --git a/src/main/java/com/gessi/dependency_detection/controller/Controller.java b/src/main/java/com/gessi/dependency_detection/controller/Controller.java
index ef084d6..d080941 100644
--- a/src/main/java/com/gessi/dependency_detection/controller/Controller.java
+++ b/src/main/java/com/gessi/dependency_detection/controller/Controller.java
@@ -2,6 +2,7 @@
 
 import java.io.IOException;
 import java.util.LinkedHashMap;
+import java.util.concurrent.ExecutionException;
 
 import javax.validation.Valid;
 import javax.validation.constraints.NotNull;
@@ -113,6 +114,8 @@ public ResponseEntity uploadJSONFile(
 			return new ResponseEntity<>(createException(e.toString(),"NLP Error"), HttpStatus.INTERNAL_SERVER_ERROR);
 		} catch (SimilarityException | LexicalSemanticResourceException e) {
 			return new ResponseEntity<>(createException(e.toString(),"Similarity Error"), HttpStatus.INTERNAL_SERVER_ERROR);
+		} catch (ExecutionException e) {
+			return new ResponseEntity<>(createException(e.toString(),"Execution Error"), HttpStatus.INTERNAL_SERVER_ERROR);
 		}
 		return new ResponseEntity<>(onjN, HttpStatus.OK);
 	}
diff --git a/src/main/java/com/gessi/dependency_detection/domain/Requirement.java b/src/main/java/com/gessi/dependency_detection/domain/Requirement.java
new file mode 100644
index 0000000..68d1f8f
--- /dev/null
+++ b/src/main/java/com/gessi/dependency_detection/domain/Requirement.java
@@ -0,0 +1,27 @@
+package com.gessi.dependency_detection.domain;
+
+public class Requirement {
+	String description;
+	String id;
+
+	public Requirement(String id, String description) {
+		this.description = description;
+		this.id = id;
+	}
+
+	public String getDescription() {
+		return description;
+	}
+
+	public void setDescription(String description) {
this.description = description; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } +} diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java b/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java index f64e6fc..47a21e7 100644 --- a/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java +++ b/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java @@ -1,22 +1,13 @@ package com.gessi.dependency_detection.functionalities; -import java.io.IOException; -import java.io.InputStream; import java.io.BufferedReader; +import java.io.IOException; import java.io.FileReader; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; +import java.util.*; +import java.util.concurrent.ExecutionException; import com.gessi.dependency_detection.util.Control; import dkpro.similarity.algorithms.api.SimilarityException; -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.testing.factory.TokenBuilder; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.lexsemresource.LexicalSemanticResource; import de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory; @@ -25,39 +16,11 @@ import dkpro.similarity.algorithms.lsr.LexSemResourceComparator; import dkpro.similarity.algorithms.lsr.path.WuPalmerComparator; -import org.springframework.core.io.ClassPathResource; - -import com.gessi.dependency_detection.components.Node; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.*; - -import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpLemmatizer; -import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpParser; -import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpPosTagger; -import opennlp.tools.sentdetect.SentenceDetectorME; -import opennlp.tools.sentdetect.SentenceModel; -import opennlp.tools.tokenize.Tokenizer; -import opennlp.tools.tokenize.TokenizerME; -import opennlp.tools.tokenize.TokenizerModel; public class NLPAnalyser { - private static ClassPathResource sentPath = new ClassPathResource("en-sent.bin"); - private static ClassPathResource tokenPath = new ClassPathResource("en-token.bin"); - - private SentenceDetectorME sentenceDetector = null; private LexSemResourceComparator comparatorWN = null; - private AnalysisEngineDescription tagger = null; - private AnalysisEngineDescription lemma = null; - private AnalysisEngine parserEngine = null; - private AnalysisEngine lemmaEngine = null; private LexicalSemanticResource wordnet; /** @@ -67,6 +30,7 @@ public class NLPAnalyser { public NLPAnalyser() { super(); try { + System.out.println("Loading"); wordnet = ResourceFactory.getInstance().get("wordnet", "en"); wordnet.setIsCaseSensitive(false); } catch (ResourceLoaderException e) { @@ -75,305 +39,75 @@ public NLPAnalyser() { } } - /** - * The approach of dependency detection - * @param requirement + * 
Semantic similarity engine (DKPRO & WordNet) + * @param term1 + * @param term2 * @return - * @throws IOException - * @throws ResourceInitializationException - * @throws UIMAException + * @throws SimilarityException + * @throws LexicalSemanticResourceException */ - public List requirementAnalysis(String requirement) - throws IOException, ResourceInitializationException, UIMAException { - - ArrayList synResult = new ArrayList<>(); - - String[] listReq = requirement.replaceAll("\\.(\\s)?(\\((\\w|\\W)+\\))$", ".").split("\n"); - for (String partReq : listReq) { - // Sentence Boundary Disambiguation - String[] splitString = sentenceDetection(partReq); + public double semanticSimilarity(String term1, String term2) + throws SimilarityException, LexicalSemanticResourceException { - for (String sentence : splitString) { - if (!sentence.equals("")) { - // Noisy text cleaning - String clearedSnetence = clearSentence(sentence); - try { - if (!clearedSnetence.equals("")) { - // tokenization - String[] tokens = tokenization(clearedSnetence); - String reqSent = ""; - for (int i = 0; i < tokens.length; i++) { - if (tokens[i].matches("(\\w\\.)$") && (i + 1) == tokens.length) { - reqSent = reqSent.concat(" " + tokens[i].substring(0, tokens[i].length() - 1) + " ."); - } else { - reqSent = reqSent.concat(" " + tokens[i]); - } - } + if (comparatorWN == null) + comparatorWN = new WuPalmerComparator(wordnet, wordnet.getRoot()); + return comparatorWN.getSimilarity(term1, term2); - // PoSTagg & Parser & Lemma - Node root = dependencyParser(reqSent); - // Information Extraction (IE) - DependencyTreeIE ie = new DependencyTreeIE(root); - synResult.addAll(ie.applyIE()); - } - } catch (NullPointerException e) { - Control.getInstance().showErrorMessage("[ERROR] The grammar of the sentence is not correct!"); - } - } - } - } - return synResult; } - /** - * Noisy text cleaning - * Rule-based method - * It uses regexp to replace and remove noisy text - * @param sentence - * @return - */ - private String clearSentence(String sentence) { - // (\(LC\))$ - sentence = sentence.replaceAll("^\t", ""); - sentence = sentence.replaceAll("\\.(\\s)?(\\((\\w|\\W)+\\))$", "."); - sentence = sentence.replaceAll("^(\\s)?(\\((\\w|\\W)+\\))$", "."); - // ^(\(?[a-zA-Z](\)|\.)(\t)?)|^(\(?\d+((\.\d+)+)?(\)|\.)?(\t)?)|^[\u2022,\u2023,\u25E6,\u2043,\u2219,\W]+(\s|\t)? - sentence = sentence.replaceAll( - "^(\\(?[a-zA-Z](\\)|\\.)(\\t)?)|^(\\(?\\d+((\\.\\d+)+)?(\\)|\\.)?(\\t)?)|^[\\u2022,\\u2023,\\u25E6,\\u2043,\\u2219,\\W]+(\\s|\\t)?", - ""); - // - sentence = sentence.replaceAll("^(\\(?(ix|iv|v?i{1,3}|x?i{1,3})\\)?)(?![a-zA-Z]).?(\\t|\\s)?", ""); - // |^(RBC)\s\d+(\.)?\s? - sentence = sentence.replaceAll( - "^(NOTE)\\s\\d+(\\.)?\\s?|^(RBC)\\s\\d+(\\.)?\\s?|^(OBU)\\s\\d+(\\.)?\\s?|^(EA)\\s\\d+(\\.)?\\s?|^(CE)\\s\\d+(\\.)?\\s?|^(GEN)\\s\\d+(\\.)?\\s?|^(LED)\\s\\d+(\\.)?\\s?", - ""); - // \\\/ - sentence = sentence.replaceAll("\\/", " / "); - - // \w+('s)(?!\w) - sentence = sentence.replaceAll("('s)(?!\\w)", " 's"); - // parentheses, quotation marks - sentence = sentence.replaceAll("\\(", " ( ").replaceAll("\\)", " ) ").replaceAll("[\"“”]", " \" "); - // \.(\s) -> used for separate the end point if the sentence detection don't - // split the phrase correctly - sentence = sentence.replaceAll("\\.(\\s)", " . 
"); - sentence = sentence.replaceAll("\\s+", " "); - - // Check the endpoint of the sentence - if (sentence.length() > 1) { - if (sentence.substring(sentence.length() - 1).equals(";") - || sentence.substring(sentence.length() - 1).equals(",") - || sentence.substring(sentence.length() - 1).equals(":")) { - sentence = sentence.substring(0, sentence.length() - 1); - } - if (!sentence.substring(sentence.length() - 1).equals(".")) { - sentence = sentence + "."; - } + public Map> prepareRequirements(Map requirements, int maxSize) throws InterruptedException, ExecutionException, IOException { + List recs=new ArrayList<>(); + for (String s:requirements.keySet()) { + recs.add(new com.gessi.dependency_detection.domain.Requirement(s,requirements.get(s))); } - return sentence; - } - - /** - * Tokenization (OpenNLP) - * @param requirmenet - * @return - * @throws IOException - */ - public String[] tokenization(String requirmenet) throws IOException { - InputStream inputFile = null; - inputFile = tokenPath.getInputStream(); - TokenizerModel model = new TokenizerModel(inputFile); - Tokenizer tokenizer = new TokenizerME(model); - return tokenizer.tokenize(requirmenet); - } - - /** - * Sentence Boundary Disambiguation (SBD) (openNLP) - * @param sentence - * @return - * @throws IOException - */ - public String[] sentenceDetection(String sentence) throws IOException { - String[] sentences = null; - // Loading sentence detector model - InputStream inputStream = null; - if (sentenceDetector == null) { - inputStream = sentPath.getInputStream(); + Map keywords; + Map>> wordOrder; + if (requirements.keySet().size()>100) { + TFIDFKeywordExtractor extractor=new TFIDFKeywordExtractor(); + keywords=extractor.computeTFIDF(recs); + wordOrder=extractor.getWordOrder(); } - try { - if (sentenceDetector == null) { - SentenceModel model = new SentenceModel(inputStream); - - // Instantiating the SentenceDetectorME class - sentenceDetector = new SentenceDetectorME(model); - } - // Detecting the sentence - sentences = sentenceDetector.sentDetect(sentence); - } catch (IOException e) { - Control.getInstance().showErrorMessage(e.getMessage()); - } finally { - if (inputStream != null) { - try { - inputStream.close(); - } catch (IOException e) { - Control.getInstance().showErrorMessage(e.getMessage()); - } - } + else { + RAKEKeywordExtractor extractor=new RAKEKeywordExtractor(); + keywords=extractor.computeRake(recs); + wordOrder=extractor.getWordOrder(); } - return sentences; - } - - /** - * Used to set the requirement text into the pipeline - * @param aEngine - * @param aLanguage - * @param aText - * @return - * @throws UIMAException - */ - public static JCas runParser(AnalysisEngine aEngine, String aLanguage, String aText) throws UIMAException { - - JCas jcas = aEngine.newJCas(); - - jcas.setDocumentLanguage(aLanguage); - - TokenBuilder tb = new TokenBuilder<>(Token.class, Sentence.class); - tb.buildTokens(jcas, aText); - - aEngine.process(jcas); - - return jcas; + return getNgrams(keywords,wordOrder,maxSize); } - /** - * Lemmatization engine (clearNLP) - * @param term - * @return - * @throws UIMAException - */ - public String lemmatization(String term) throws UIMAException { - if (lemmaEngine == null && tagger == null && lemma == null) { - - tagger = createEngineDescription(ClearNlpPosTagger.class); - lemma = createEngineDescription(ClearNlpLemmatizer.class); - - lemmaEngine = createEngine(createEngineDescription(tagger, lemma)); - } - JCas jcas = runParser(lemmaEngine, "en", term); - Collection lemmas = JCasUtil.select(jcas, 
Lemma.class); - String ret = ""; - String[] terms = term.split(" "); - int i = 0; - if (!lemmas.isEmpty()) { - for (Lemma l : lemmas) { - if (!l.getValue().matches("\\d+")) { - if (!ret.equals("")) - ret = ret.concat(" " + l.getValue()); - else - ret = l.getValue(); - } else { - if (!ret.equals("")) - ret = ret.concat(" " + terms[i]); - else - ret = terms[i]; + private Map> getNgrams(Map keywords, Map>> wordOrder, int maxSize) { + Map> result=new HashMap<>(); + for (String s:keywords.keySet()) { + TreeMap orderedKeywords=new TreeMap<>(); + for (String k:keywords.get(s).split(" ")) { + if (wordOrder.get(s).containsKey(k)) { + for (Integer i : wordOrder.get(s).get(k)) { + orderedKeywords.put(i, k); + } } - i++; } - } - return ret; - } - - /** - * Dependency parser engine (clearNLP) - * This function generates a dependency tree from the dependency parser results. - * @param aText - * @return - * @throws ResourceInitializationException - * @throws UIMAException - * @throws IOException - */ - public Node dependencyParser(String aText) throws ResourceInitializationException, UIMAException, IOException { - if (parserEngine == null) { - parserEngine = createEngine(createEngineDescription(createEngineDescription(tagger, lemma), - createEngineDescription(ClearNlpParser.class))); - } - JCas jcas = runParser(parserEngine, "en", aText); - Node root = null; - ArrayList dependencyTree = new ArrayList<>(); - Collection deps = JCasUtil.select(jcas, Dependency.class); - if (!deps.isEmpty()) { - for (Dependency d : deps) { - Node node = new Node(d.getDependent().getBegin(), d.getGovernor().getBegin(), - d.getDependent().getPosValue(), d.getDependencyType(), d.getDependent().getCoveredText(), - d.getDependent().getLemmaValue(), d); - dependencyTree.add(node); - } - - root = fillTreeLinks(dependencyTree); - } - return root; - } - - /** - * Update the tree information - * @param tree - * @return - */ - private Node fillTreeLinks(ArrayList tree) { - Node root = null; - for (Node n : tree) { - if (n.getParentId() > n.getId()) { - int pIdx = findParent(tree, n.getParentId(), tree.indexOf(n) + 1, n.getParentId() > n.getId()); - tree.get(pIdx).addSonNodes(n); - - } else if (n.getParentId() < n.getId()) { - int pIdx = findParent(tree, n.getParentId(), tree.indexOf(n) - 1, n.getParentId() > n.getId()); - tree.get(pIdx).addSonNodes(n); - } else { - root = n; + List ordered=new ArrayList<>(); + for (String o:orderedKeywords.values()) { + ordered.add(o); } + List ngrams=ngrams(ordered,maxSize); + result.put(s,ngrams); } - return root; + return result; } - /** - * Find the parent of the node from the dependncy parser results - * @param tree - * @param parentId - * @param idx - * @param next - * @return - */ - private int findParent(ArrayList tree, int parentId, int idx, boolean next) { - boolean find = false; - while (!find) { - if (tree.get(idx).getId() == parentId) { - find = true; - } else if (next) { - idx++; - } else { - idx--; + private List ngrams(List ordered, int maxSize) { + List result=new ArrayList<>(); + for (int i=0;i readFile(String path) { Control.getInstance().showErrorMessage(e.getMessage()); } return fileLines; - } -} \ No newline at end of file + }} \ No newline at end of file diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java b/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java index 411bf4b..88c57db 100644 --- a/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java +++ 
b/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java @@ -3,12 +3,12 @@ import java.io.IOException; import java.util.*; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.S; import org.apache.log4j.varia.NullAppender; import org.apache.uima.UIMAException; import com.gessi.dependency_detection.components.Dependency; import com.gessi.dependency_detection.components.DependencyType; -import com.gessi.dependency_detection.components.Node; import com.gessi.dependency_detection.components.Status; import com.hp.hpl.jena.ontology.DatatypeProperty; import com.hp.hpl.jena.ontology.Individual; @@ -60,11 +60,13 @@ public void loadOnt(String source, String path) throws IOException { * @param analizer * @throws IOException * @throws UIMAException + * @return */ - public void searchClasses(NLPAnalyser analizer) throws IOException, UIMAException { + public int searchClasses(NLPAnalyser analizer) throws IOException, UIMAException { ontClasses = new ArrayList<>(); classesWords = new ArrayList<>(); classesLemmas = new ArrayList<>(); + int max=1; ExtendedIterator rootClasses = this.model.listClasses(); while (rootClasses.hasNext()) { OntClass thisClass = (OntClass) rootClasses.next(); @@ -82,10 +84,10 @@ public void searchClasses(NLPAnalyser analizer) throws IOException, UIMAExceptio ontTerm = words[i]; } } - String[] lemmas = extractLemmas(ontTerm, analizer); - + String[] lemmas = extractLemmas(ontTerm); ontClasses.add(thisClass); classesWords.add(words); + if (words.length>max) max=words.length; classesLemmas.add(lemmas); for (int i = 0; i < lemmas.length; i++) { synonyms.put(lemmas[i], new ArrayList<>()); @@ -93,6 +95,13 @@ public void searchClasses(NLPAnalyser analizer) throws IOException, UIMAExceptio } } } + return max; + } + + private String[] extractLemmas(String ontTerm) throws IOException { + TextPreprocessing textPreprocessing=new TextPreprocessing(); + String l=textPreprocessing.text_preprocess(ontTerm); + return l.split(" "); } /** @@ -188,7 +197,7 @@ private boolean isSameNgram(Stack ngramTerm, Stack ngramLemma, S } find = false; int j = 0; - while (j < words.length && !find) { + while (j< lemmas.length && j < words.length && !find) { if (!idxOntLemmaAnalized.contains(j) && isSameTerm(ngramTerm.get(i), ngramLemma.get(i), words[j], lemmas[j])) { find = true; @@ -200,7 +209,7 @@ && isSameTerm(ngramTerm.get(i), ngramLemma.get(i), words[j], lemmas[j])) { } // of it is not detected, check the synonymy - if (!find && syny) { + /*if (!find && syny) { for (int i = 0; i < ngramLemma.size(); i++) { if (!idxReqLemmaAnalized.contains(i)) { @@ -219,7 +228,7 @@ && isSynonym(ngramLemma.get(i), lemmas[j], analizer, thr)) { } } else find = true; } - } + }*/ return find; } @@ -266,24 +275,39 @@ private boolean findPotentialNgram(int idx, int level, int n, String[] termsNode * ontology. 
* * @param node - * @param words * @param lemmas - * @param analizer * @param syny - * @param thr * @return * @throws SimilarityException * @throws LexicalSemanticResourceException */ - private boolean extractNGram(Node node, String[] words, String[] lemmas, NLPAnalyser analizer, boolean syny, - double thr) throws SimilarityException, LexicalSemanticResourceException { - String[] termsNode = node.getTerm().split(" "); - String[] lemmasNode = node.getLemma().split(" "); - int n = words.length; - Stack ngramTerm = new Stack<>(); - Stack ngramLemma = new Stack<>(); - - return findPotentialNgram(0, 1, n, termsNode, lemmasNode, ngramTerm, ngramLemma, words, lemmas, analizer, syny, thr); + private boolean extractNGram(String node, String[] lemmas, boolean syny) throws SimilarityException, LexicalSemanticResourceException { + String[] lemmasNode = node.split(" "); + int n = lemmas.length; + System.out.println("LEMMAS"); + for (String o:lemmas) { + System.out.println(o); + } + System.out.println("NODE LEMMAS"); + for (String o:lemmasNode) { + System.out.println(o); + } + + if (!syny) { + for (int i = 0; i < n; ++i) { + if (!lemmas[i].equals(lemmasNode[i])) return false; + } + } + else { + for (int i = 0; i < n; ++i) { + if (!isSynonym(lemmas[i],lemmasNode[i])) return false; + } + } + return true; + } + + private boolean isSynonym(String lemma, String s) { + return false; } /** @@ -291,31 +315,27 @@ private boolean extractNGram(Node node, String[] words, String[] lemmas, NLPAnal * concepts), and store the requirement within the related ontology class if * they matches with a concept of the ontology. * - * @param topNodes + * @param ngrams * @param reqId * @param requirement - * @param analizer * @param syny - * @param thr * @throws IOException * @throws SimilarityException * @throws LexicalSemanticResourceException */ - public void matching(List topNodes, String reqId, String requirement, NLPAnalyser analizer, boolean syny, - double thr) throws IOException, SimilarityException, LexicalSemanticResourceException { + public void matching(List ngrams, String reqId, String requirement, boolean syny) throws IOException, SimilarityException, LexicalSemanticResourceException { ArrayList classes = new ArrayList<>(); - String[] words; String[] lemmas; - for (int i = 0; i < topNodes.size(); i++) { + for (int i = 0; i < ngrams.size(); i++) { for (int j = 0; j < ontClasses.size(); j++) { - words = classesWords.get(j); lemmas = classesLemmas.get(j); - if (topNodes.get(i).getTerm().split(" ").length >= words.length && extractNGram(topNodes.get(i), words, lemmas, analizer, syny, thr)) classes.add(ontClasses.get(j)); + if (ngrams.get(i).split(" ").length == lemmas.length && extractNGram(ngrams.get(i), lemmas, syny)) classes.add(ontClasses.get(j)); } } // Requirement instantiation within the ontology for (OntClass cls : classes) { + System.out.println(cls.getLocalName()); Individual individual = this.model.createIndividual(this.source + ":" + reqId + "_" + cls.getLocalName(), cls); DatatypeProperty req = this.model.getDatatypeProperty(this.source + "#requirement"); @@ -328,20 +348,7 @@ public void matching(List topNodes, String reqId, String requirement, NLPA } /** - * Extract lemmas from ontology classes - * @param words - * @param analizer - * @return - * @throws IOException - * @throws UIMAException - */ - private String[] extractLemmas(String words, NLPAnalyser analizer) throws IOException, UIMAException { - String ontLemma = analizer.lemmatization(words); - return ontLemma.split(" "); - } - - /** - * Analyze 
the ontology and extract dependncies
+	 * Analyze the ontology and extract dependencies
 	 * @return
 	 */
 	public List<Dependency> ontConflictDetection() {
diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java b/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java
new file mode 100644
index 0000000..5f71495
--- /dev/null
+++ b/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java
@@ -0,0 +1,137 @@
+package com.gessi.dependency_detection.functionalities;
+
+import com.gessi.dependency_detection.domain.Requirement;
+import com.linguistic.rake.Rake;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class RAKEKeywordExtractor {
+    private Double cutoff = 3.0;
+    private TextPreprocessing preprocess = new TextPreprocessing();
+    private Map<String, Map<String, List<Integer>>> wordOrder=new HashMap<>();
+
+    /**
+     * Passes the text through Lucene's token analyzer
+     * @param text Text to clean
+     * @param analyzer Analyzer to use
+     * @return Returns a cleaned list of strings
+     */
+    public static List<String> getAnalyzedStrings(String text, Analyzer analyzer) throws IOException {
+        List<String> result=new ArrayList<>();
+        TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
+        CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
+        tokenStream.reset();
+        while (tokenStream.incrementToken()) {
+            result.add(attr.toString());
+        }
+        return result;
+    }
+
+
+    /**
+     * Extracts keywords using the RAKE algorithm from a given corpus
+     * @param corpus Corpus to be used for RAKE
+     * @return Returns a list of maps, comprised of <keyword, RAKE score> pairs
+     */
+    public List<Map<String, Double>> extractKeywords(List<Requirement> corpus) throws IOException {
+        List<Map<String, Double>> res = new ArrayList<>();
+        Rake rake = new Rake();
+        for (Requirement s : corpus) {
+            String text = "";
+            int index=0;
+            Map<String, List<Integer>> wordOrderInferior=new HashMap<>();
+            for (String k : RAKEanalyzeNoStopword(s.getDescription())) {
+                if (wordOrderInferior.containsKey(k)) {
+                    List<Integer> order=wordOrderInferior.get(k);
+                    order.add(index);
+                    wordOrderInferior.put(k,order);
+                }
+                else {
+                    List<Integer> order=new ArrayList<>();
+                    order.add(index);
+                    wordOrderInferior.put(k,order);
+                }
+                index++;
+                text = text + " " + k;
+            }
+            wordOrder.put(s.getId(),wordOrderInferior);
+            Map<String, Double> aux = rake.getKeywordsFromText(text);
+            String sum = "";
+            for (String j : aux.keySet()) {
+                Double val = aux.get(j);
+                if (val >= cutoff) sum = sum + " " + j;
+            }
+            List<String> result = RAKEanalyze(sum);
+            Map<String, Double> helper = new HashMap<>();
+            for (String i : result) {
+                helper.put(i, aux.get(i));
+            }
+            res.add(helper);
+        }
+        return res;
+    }
+
+    /**
+     * Extracts keywords using the RAKE algorithm
+     * @param corpus Requirement corpus to be analyzed
+     * @return Returns a map from requirement id to the extracted keyword text
+     */
+    public Map<String, String> computeRake(List<Requirement> corpus) throws IOException {
+        List<Map<String, Double>> res = extractKeywords(corpus);
+        Map<String, String> processedRequirements=new HashMap<>();
+        int counter=0;
+        for (Requirement r: corpus) {
+            String newText="";
+            for (String s:res.get(counter).keySet()) {
+                newText=newText+" "+s;
+            }
+            processedRequirements.put(r.getId(),newText);
+            ++counter;
+        }
+        return processedRequirements;
+    }
+
+    /**
+     * Cleans text, removing stopwords
+     * @param text Text to clean
+     * @return Returns a cleaned list of strings
+     */
+    List<String> RAKEanalyze(String text) throws IOException {
+        text = preprocess.text_preprocess(text);
+        Analyzer analyzer = CustomAnalyzer.builder()
+                .withTokenizer("standard")
+                .addTokenFilter("lowercase")
+                .addTokenFilter("stop")
+                .addTokenFilter("kstem")
+                .build();
+        return getAnalyzedStrings(text, analyzer);
+    }
+
+    /**
+     * Cleans text for the RAKE algorithm to use, keeping stopwords
+     * @param text Text to clean
+     * @return Returns a cleaned list of strings
+     */
+    List<String> RAKEanalyzeNoStopword(String text) throws IOException {
+        Analyzer analyzer = CustomAnalyzer.builder()
+                .withTokenizer("standard")
+                .addTokenFilter("lowercase")
+                .addTokenFilter("kstem")
+                .build();
+        return getAnalyzedStrings(text, analyzer);
+    }
+
+    public Map<String, Map<String, List<Integer>>> getWordOrder() {
+        return wordOrder;
+    }
+}
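For reference, everything above is driven through computeRake: each requirement is run through the stopword-keeping analyzer, candidate phrases are scored with RAKE, phrases at or above the 3.0 cutoff are kept, and one keyword string per requirement id is returned. A minimal usage sketch follows; the class name RakeDemo and both requirement texts are invented for illustration, and it assumes the working directory is the repository root so TextPreprocessing can find ExcludedWords.txt.

    import com.gessi.dependency_detection.domain.Requirement;
    import com.gessi.dependency_detection.functionalities.RAKEKeywordExtractor;

    import java.io.IOException;
    import java.util.Arrays;
    import java.util.List;
    import java.util.Map;

    public class RakeDemo {
        public static void main(String[] args) throws IOException {
            // Requirement takes (id, description); two toy requirements stand in for a corpus.
            List<Requirement> corpus = Arrays.asList(
                    new Requirement("R1", "The system shall encrypt all stored user data."),
                    new Requirement("R2", "The system shall log every failed login attempt."));
            RAKEKeywordExtractor extractor = new RAKEKeywordExtractor();
            // id -> space-separated keywords whose RAKE score reached the cutoff
            Map<String, String> keywords = extractor.computeRake(corpus);
            System.out.println(keywords.get("R1"));
        }
    }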
diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/TFIDFKeywordExtractor.java b/src/main/java/com/gessi/dependency_detection/functionalities/TFIDFKeywordExtractor.java
new file mode 100644
index 0000000..5184785
--- /dev/null
+++ b/src/main/java/com/gessi/dependency_detection/functionalities/TFIDFKeywordExtractor.java
@@ -0,0 +1,188 @@
+package com.gessi.dependency_detection.functionalities;
+
+import com.gessi.dependency_detection.domain.Requirement;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.concurrent.ExecutionException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+public class TFIDFKeywordExtractor {
+
+    private Double cutoffParameter=4.0; //This can be set to different values for different selectivity (more or less keywords)
+    private HashMap<String, Integer> corpusFrequency = new HashMap<>();
+    private TextPreprocessing text_preprocess = new TextPreprocessing();
+    private Map<String, Map<String, List<Integer>>> wordOrder=new HashMap<>();
+
+
+    /**
+     * Computes the term frequency of each word in the text and updates the corpus-wide document frequency
+     * @param doc List of strings to analyze
+     * @return Returns a map identified by <word, term frequency>
+     */
+    private Map<String, Integer> tf(List<String> doc) {
+        Map<String, Integer> frequency = new HashMap<>();
+        for (String s : doc) {
+            if (frequency.containsKey(s)) frequency.put(s, frequency.get(s) + 1);
+            else {
+                frequency.put(s, 1);
+                if (corpusFrequency.containsKey(s)) corpusFrequency.put(s, corpusFrequency.get(s) + 1);
+                else corpusFrequency.put(s, 1);
+            }
+
+        }
+        return frequency;
+    }
+
+    private double idf(Integer size, Integer frequency) {
+        return StrictMath.log(size.doubleValue() / frequency.doubleValue() + 1.0);
+    }
+
+    /**
+     * Preprocesses the text
+     * @param text Text to preprocess
+     * @param analyzer Analyzer to use
+     * @return Returns a list of cleaned strings
+     */
+    private List<String> analyze(String text, Analyzer analyzer,String reqId) throws IOException {
+        text = clean_text(text,reqId);
+        return RAKEKeywordExtractor.getAnalyzedStrings(text, analyzer);
+    }
+
+    /**
+     * Preprocesses the text with an English analyzer
+     * @param text Text to preprocess
+     * @return Returns a list of cleaned strings
+     */
+    private List<String> englishAnalyze(String text,String reqId) throws IOException {
+        Analyzer analyzer = CustomAnalyzer.builder()
+                .withTokenizer("standard")
+                .addTokenFilter("lowercase")
+                .addTokenFilter("stop")
+                .addTokenFilter("kstem")
+                .build();
+        return analyze(text, analyzer, reqId);
+    }
+
+    /**
+     * Computes tf-idf on a corpus of requirements
+     * @param corpus Corpus to be used for tf-idf
+     * @return Returns a map identified by <requirement id, keyword text>
+     */
+    public Map<String, String> computeTFIDF(List<Requirement> corpus) throws IOException, ExecutionException, InterruptedException {
+        List<List<String>> trueDocs = new ArrayList<>();
+        for (Requirement r : corpus) {
+            List<String> s = englishAnalyze(r.getDescription(),r.getId());
+            trueDocs.add(s);
+        }
+        List<Map<String, Double>> res = tfIdf(trueDocs);
+        Map<String, String> processedRequirements=new HashMap<>();
+        int counter=0;
+        for (Requirement r: corpus) {
+            String newText="";
+            for (String s:res.get(counter).keySet()) {
+                newText=newText+" "+s;
+            }
+            processedRequirements.put(r.getId(),newText);
+            ++counter;
+        }
+        return processedRequirements;
+
+    }
+    /**
+     * Computes tf-idf on a list of documents
+     * @param docs Corpus to be used for tf-idf
+     * @return Returns a list of maps, comprised of <keyword, tf-idf score> pairs
+     */
+    private List<Map<String, Double>> tfIdf(List<List<String>> docs) {
+        List<Map<String, Double>> tfidfComputed = new ArrayList<>();
+        List<Map<String, Integer>> wordBag = new ArrayList<>();
+        for (List<String> doc : docs) {
+            wordBag.add(tf(doc));
+        }
+        int counter = 0;
+        for (List<String> doc : docs) {
+            HashMap<String, Double> aux = new HashMap<>();
+            for (String s : new TreeSet<>(doc)) {
+                Double idf = idf(docs.size(), corpusFrequency.get(s));
+                Integer tf = wordBag.get(counter).get(s);
+                Double tfidf = idf * tf;
+                if (tfidf >= cutoffParameter && s.length() > 1) {
+                    aux.put(s, tfidf);
+                }
+            }
+            ++counter;
+            tfidfComputed.add(aux);
+        }
+        return tfidfComputed;
+
+    }
+
+    /**
+     * Preprocesses the text and adds two special rules to help keyword extraction: any word written
+     * entirely in capital letters is boosted so that it becomes a keyword, and so is any word between []
+     * @param text Text to preprocess
+     * @return Returns the cleaned text
+     */
+    private String clean_text(String text,String reqId) throws IOException {
+        text = text_preprocess.text_preprocess(text);
+        String result = "";
+        if (text.contains("[")) {
+            Pattern p = Pattern.compile("\\[(.*?)\\]");
+            Matcher m = p.matcher(text);
+            while (m.find()) {
+                text = text + " " + m.group().toUpperCase();
+            }
+        }
+        int index=0;
+        Map<String, List<Integer>> wordOrderInterior=new HashMap<>();
+        for (String a : text.split(" ")) {
+            if (wordOrderInterior.containsKey(a)) {
+                List<Integer> order=wordOrderInterior.get(a);
+                order.add(index);
+                wordOrderInterior.put(a,order);
+            }
+            else {
+                List<Integer> order=new ArrayList<>();
+                order.add(index);
+                wordOrderInterior.put(a,order);
+            }
+            index++;
+            String helper = "";
+            if (a.toUpperCase().equals(a)) {
+                for (int i = 0; i < 10; ++i) {
+                    helper = helper.concat(" " + a);
+                }
+                a = helper;
+            }
+            result = result.concat(" " + a);
+        }
+        wordOrder.put(reqId,wordOrderInterior);
+        return result;
+    }
+
+
+    public HashMap<String, Integer> getCorpusFrequency() {
+        return corpusFrequency;
+    }
+
+    public void setCorpusFrequency(HashMap<String, Integer> corpusFrequency) {
+        this.corpusFrequency = corpusFrequency;
+    }
+
+    public Double getCutoffParameter() {
+        return cutoffParameter;
+    }
+
+    public void setCutoffParameter(Double cutoffParameter) {
+        this.cutoffParameter = cutoffParameter;
+    }
+
+
+    public Map<String, Map<String, List<Integer>>> getWordOrder() {
+        return wordOrder;
+    }
+}
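To make the scoring concrete: a term with term frequency tf inside one document, appearing in df of the N corpus documents, scores tf * ln(N/df + 1.0), and survives only if that product reaches cutoffParameter (4.0 by default) and the term is longer than one character. A small self-contained arithmetic check, with all numbers invented:

    public class TfIdfArithmetic {
        public static void main(String[] args) {
            int corpusSize = 200;      // N: number of requirements
            int docFrequency = 5;      // df: requirements containing the term
            int termFrequency = 2;     // tf: occurrences inside one requirement
            double idf = StrictMath.log((double) corpusSize / docFrequency + 1.0);
            double tfidf = termFrequency * idf;
            // ln(200/5 + 1) = ln(41) ≈ 3.71, so tfidf ≈ 7.43 and the term is kept;
            // with tf = 1 it would score ≈ 3.71 and fall under the 4.0 cutoff.
            System.out.println(tfidf >= 4.0);
        }
    }

Note that df here counts a document only once per term, because tf() updates corpusFrequency only on a term's first occurrence within a document.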
diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/TextPreprocessing.java b/src/main/java/com/gessi/dependency_detection/functionalities/TextPreprocessing.java
new file mode 100644
index 0000000..7410fc2
--- /dev/null
+++ b/src/main/java/com/gessi/dependency_detection/functionalities/TextPreprocessing.java
@@ -0,0 +1,62 @@
+package com.gessi.dependency_detection.functionalities;
+
+import org.springframework.stereotype.Service;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+@Service
+public class TextPreprocessing {
+
+    Set<String> exclusions = null;
+
+    /**
+     * Preprocesses the text by removing stopwords and characters that hold no semantic meaning, or would worsen the semantic analysis
+     * @param text The string to preprocess
+     * @return The preprocessed text
+     */
+    public String text_preprocess(String text) throws IOException {
+        String trueRes = "";
+        if (text != null) {
+            text = text.replaceAll("(\\{.*?})", " code ");
+            text = text.replaceAll("[$,;\\\"/:|!?=()><_{}'+%[0-9]]", " ");
+            text = text.replaceAll("] \\[", "][");
+
+            if (exclusions == null) {
+                BufferedReader reader = new BufferedReader(new FileReader("src/main/resources/ExcludedWords.txt"));
+                String word = null;
+                exclusions = new HashSet<>();
+
+                while ((word = reader.readLine()) != null) {
+                    exclusions.add(word);
+                }
+                reader.close();
+            }
+            for (String l : text.split(" ")) {
+                if (!(l.toLowerCase().equals("null") && !l.equals("null") && !l.equals("Null")) && !l.toUpperCase().equals(l)) l = l.toLowerCase();
+                if (l != null && !exclusions.contains(l) && l.length() > 1) {
+                    String[] aux=l.split("\\.");
+                    if (!(aux.length>1 && (aux[1]==null|| aux[0]==null || aux[0].equals("")&&aux[1].equals("") || aux[0].equals(" ")|| aux[1].equals(" ")))) {
+                        if (aux.length > 1) {
+                            String repeatingWord = aux[0];
+                            l = aux[0] + " " + aux[0];
+                            for (int i = 1; i < aux.length; ++i) {
+                                repeatingWord = repeatingWord + "." + aux[i];
+                                l = l + "." + aux[i];
+                                if (i != (aux.length - 1)) l = l + " " + repeatingWord;
+                            }
+                        }
+                    }
+                    else l=l.replace(".","");
+                    trueRes = trueRes.concat(l + " ");
+                }
+            }
+        }
+        return trueRes;
+
+    }
+
+}
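The preprocessor above is shared by both extractors, so its quirks propagate: words listed in ExcludedWords.txt are dropped, most punctuation is blanked, single-character tokens disappear, and a dotted token is expanded into its dotted prefixes so compound identifiers still match their parts. A hypothetical driver (the input sentence is invented; it must run from the repository root so the hardcoded src/main/resources/ExcludedWords.txt path resolves):

    import com.gessi.dependency_detection.functionalities.TextPreprocessing;

    import java.io.IOException;

    public class PreprocessDemo {
        public static void main(String[] args) throws IOException {
            TextPreprocessing pre = new TextPreprocessing();
            // "The" and "at" are excluded words, and "config.server.port" expands to
            // "config config.server config.server.port".
            System.out.println(pre.text_preprocess("The module reads config.server.port at startup"));
        }
    }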
diff --git a/src/main/java/com/gessi/dependency_detection/service/DependencyService.java b/src/main/java/com/gessi/dependency_detection/service/DependencyService.java
index 4ffc1be..d4e7950 100644
--- a/src/main/java/com/gessi/dependency_detection/service/DependencyService.java
+++ b/src/main/java/com/gessi/dependency_detection/service/DependencyService.java
@@ -8,6 +8,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.concurrent.ExecutionException;
 
 import org.apache.uima.UIMAException;
 import org.apache.uima.resource.ResourceInitializationException;
@@ -181,31 +182,25 @@ public void loadOntology() throws IOException {
 	 * @throws IOException
 	 * @throws ResourceInitializationException
 	 * @throws UIMAException
-	 * @throws dkpro.similarity.algorithms.api.SimilarityException 
+	 * @throws dkpro.similarity.algorithms.api.SimilarityException
 	 * @throws LexicalSemanticResourceException
 	 */
 	public ObjectNode conflictDependencyDetection(String projectId, boolean syny, double thr)
 			throws IOException, ResourceInitializationException, UIMAException,
-			dkpro.similarity.algorithms.api.SimilarityException, LexicalSemanticResourceException {
+			dkpro.similarity.algorithms.api.SimilarityException, LexicalSemanticResourceException, ExecutionException, InterruptedException {
 
 		// analyse the ontology classes
-		ontHandler.searchClasses(analizer);
+		int maxSize=ontHandler.searchClasses(analizer);
 		// read the requirements from JSON
 		Map<String, String> requirements = jsonHandler.readRequirement(json, projectId);
 
 		// foreach requirement
+		Map<String, List<String>> syntxResutls=analizer.prepareRequirements(requirements,maxSize);
 		for (Entry<String, String> entry : requirements.entrySet()) {
-			String key = entry.getKey();
-			String value = entry.getValue();
-			if (key != null && value != null && !value.equals("")) {
-				// Apply NLP methods (syntactic approach)
-				List<Node> syntxResutls = analizer.requirementAnalysis(value);
-
-				// Matching of extracted terms with the ontology, it is also applied the
semantic appraoch - ontHandler.matching(syntxResutls, key, value, analizer, syny, thr); - } + ontHandler.matching(syntxResutls.get(entry.getKey()), entry.getKey(), entry.getValue(), syny); } // Extract dependencies from the ontology List deps = ontHandler.ontConflictDetection(); + System.out.println(deps.size()); return jsonHandler.storeDependencies(json, deps); } } diff --git a/src/main/resources/ExcludedWords.txt b/src/main/resources/ExcludedWords.txt new file mode 100644 index 0000000..5aad907 --- /dev/null +++ b/src/main/resources/ExcludedWords.txt @@ -0,0 +1,1075 @@ +x +y +your +yours +yourself +yourselves +you +yond +yonder +yon +ye +yet +z +zillion +j +u +umpteen +usually +us +username +uponed +upons +uponing +upon +ups +upping +upped +up +unto +until +unless +unlike +unliker +unlikest +under +underneath +use +used +usedest +r +rath +rather +rathest +rathe +re +relate +related +relatively +regarding +really +res +respecting +respectively +q +quite +que +qua +n +neither +neaths +neath +nethe +nethermost +necessary +necessariest +necessarier +never +nevertheless +nigh +nighest +nigher +nine +noone +nobody +nobodies +nowhere +nowheres +no +noes +nor +nos +no-one +none +not +notwithstanding +nothings +nothing +nathless +natheless +t +ten +tills +till +tilled +tilling +to +towards +toward +towardest +towarder +together +too +thy +thyself +thus +than +that +those +thou +though +thous +thouses +thoroughest +thorougher +thorough +thoroughly +thru +thruer +thruest +thro +through +throughout +throughest +througher +thine +this +thises +they +thee +the +then +thence +thenest +thener +them +themselves +these +therer +there +thereby +therest +thereafter +therein +thereupon +therefore +their +theirs +thing +things +three +two +o +oh +owt +owning +owned +own +owns +others +other +otherwise +otherwisest +otherwiser +of +often +oftener +oftenest +off +offs +offest +one +ought +oughts +our +ours +ourselves +ourself +out +outest +outed +outwith +outs +outside +over +overallest +overaller +overalls +overall +overs +or +orer +orest +on +oneself +onest +ons +onto +a +atween +at +athwart +atop +afore +afterward +afterwards +after +afterest +afterer +ain +an +any +anything +anybody +anyone +anyhow +anywhere +anent +anear +and +andor +another +around +ares +are +aest +aer +against +again +accordingly +abaft +abafter +abaftest +abovest +above +abover +abouter +aboutest +about +aid +amidst +amid +among +amongst +apartest +aparter +apart +appeared +appears +appear +appearing +appropriating +appropriate +appropriatest +appropriates +appropriater +appropriated +already +always +also +along +alongside +although +almost +all +allest +aller +allyou +alls +albeit +awfully +as +aside +asides +aslant +ases +astrider +astride +astridest +astraddlest +astraddler +astraddle +availablest +availabler +available +aughts +aught +vs +v +variousest +variouser +various +via +vis-a-vis +vis-a-viser +vis-a-visest +viz +very +veriest +verier +versus +k +g +go +gone +good +got +gotta +gotten +get +gets +getting +b +by +byandby +by-and-by +bist +both +but +buts +be +beyond +because +became +becomes +become +becoming +becomings +becominger +becomingest +behind +behinds +before +beforehand +beforehandest +beforehander +bettered +betters +better +bettering +betwixt +between +beneath +been +below +besides +beside +m +my +myself +mucher +muchest +much +must +musts +musths +musth +main +make +mayest +many +mauger +maugre +me +meanwhiles +meanwhile +mostly +most +moreover +more +might +mights +midst +midsts +h +huh +humph +he +hers 
+herself +her +hereby +herein +hereafters +hereafter +hereupon +hence +hadst +had +having +haves +have +has +hast +hardly +hae +hath +him +himself +hither +hitherest +hitherer +his +how-do-you-do +however +how +howbeit +howdoyoudo +hoos +hoo +w +woulded +woulding +would +woulds +was +wast +we +wert +were +with +withal +without +within +why +what +whatever +whateverer +whateverest +whatsoeverer +whatsoeverest +whatsoever +whence +whencesoever +whenever +whensoever +when +whenas +whether +wheen +whereto +whereupon +wherever +whereon +whereof +where +whereby +wherewithal +wherewith +whereinto +wherein +whereafter +whereas +wheresoever +wherefrom +which +whichever +whichsoever +whilst +while +whiles +whithersoever +whither +whosoever +whoso +whomever +s +syne +syn +shalling +shalled +shalls +shoulding +should +shoulded +shoulds +she +sayyid +sayid +said +saider +saidest +same +samest +sames +samer +saved +sans +sanses +sanserifs +sanserif +so +soer +soest +sobeit +someone +somebody +somehow +some +somewhere +somewhat +something +sometimest +sometimes +sometimer +sometime +several +severaler +severalest +serious +seriousest +seriouser +senza +send +sent +seem +seems +seemed +seemingest +seeminger +seemings +seven +summat +sups +sup +supping +supped +such +since +sine +sines +sith +six +stop +stopped +p +plaintiff +plenty +plenties +please +pleased +pleases +per +perhaps +particulars +particularly +particular +particularest +particularer +pro +providing +provides +provided +provide +probably +l +layabout +layabouts +latter +latterest +latterer +latterly +latters +lots +lotting +lotted +lot +lest +less +ie +ifs +if +i +info +information +itself +its +it +is +idem +idemer +idemest +immediate +immediately +immediatest +immediater +in +inwards +inwardest +inwarder +inward +inasmuch +into +instead +insofar +indicates +indicated +indicate +indicating +indeed +inc +f +fact +facts +fs +figupon +figupons +figuponing +figuponed +few +fewer +fewest +frae +from +failing +failings +five +furthers +furtherer +furthered +furtherest +further +furthering +furthermore +fourscore +followthrough +for +forwhy +fornenst +formerly +former +formerer +formerest +formers +forbye +forby +fore +forever +forer +fores +four +d +ddays +dday +do +doing +doings +doe +does +doth +downwarder +downwardest +downward +downwards +downs +done +doner +dones +donest +dos +dost +did +differentest +differenter +different +describing +describe +describes +described +despiting +despites +despited +despite +during +c +cum +circa +chez +cer +certain +certainest +certainer +cest +canst +cannot +cant +cants +canting +cantest +canted +co +could +couldst +comeon +comeons +come-ons +come-on +concerning +concerninger +concerningest +consequently +considering +e +eg +eight +either +even +evens +evenser +evensest +evened +evenest +ever +everyone +everything +everybody +everywhere +every +ere +each +et +etc +elsewhere +else +ex +excepted +excepts +except +excepting +exes +enough +ins +able +abst +accordance +according +across +act +actually +added +adj +affected +affecting +affects +ah +alone +am +announce +anymore +anyway +anyways +apparently +approximately +aren +arent +arise +ask +asking +auth +away +back +begin +beginning +beginnings +begins +being +believe +biol +brief +briefly +ca +came +can +can't +cause +causes +certainly +com +come +comes +contain +containing +contains +couldnt +date +didn't +doesn't +don't +down +due +ed +edu +effect +eighty +end +ending +especially +et-al +far +ff +fifth +first +fix +followed +following +follows +forth 
+found +gave +give +given +gives +giving +goes +happens +hasn't +haven't +hed +here +heres +hes +hi +hid +home +hundred +id +i'll +im +importance +important +index +invention +isn't +itd +it'll +i've +just +keep +keeps +kept +kg +km +know +known +knows +largely +last +lately +later +least +let +lets +like +liked +likely +line +little +'ll +look +looking +looks +ltd +made +mainly +makes +may +maybe +mean +means +meantime +merely +mg +million +miss +ml +mr +mrs +mug +na +name +namely +nay +nd +near +nearly +necessarily +need +needs +new +next +ninety +non +nonetheless +normally +noted +now +obtain +obtained +obviously +ok +okay +old +omitted +once +ones +only +ord +owing +page +pages +part +past +placed +plus +poorly +possible +possibly +potentially +pp +predominantly +present +previously +primarily +promptly +proud +put +quickly +qv +ran +rd +readily +recent +recently +ref +refs +regardless +regards +research +resulted +resulting +results +right +run +saw +say +saying +says +sec +section +see +seeing +seeming +seen +self +selves +shall +shed +she'll +shes +shouldn't +show +showed +shown +showns +shows +significant +significantly +similar +similarly +slightly +somethan +somethin +soon +sorry +specifically +specified +specify +specifying +still +strongly +sub +substantially +successfully +sufficiently +suggest +sure +take +taken +taking +tell +tends +th +thank +thanks +thanx +that'll +thats +that've +thered +there'll +thereof +therere +theres +thereto +there've +theyd +they'll +theyre +they've +think +thoughh +thousand +throug +til +tip +took +tried +tries +truly +try +trying +ts +twice +un +unfortunately +unlikely +useful +usefully +usefulness +uses +using +value +'ve +vol +vols +want +wants +wasnt +way +wed +welcome +we'll +went +werent +we've +what'll +whats +wheres +whim +who +whod +whoever +whole +who'll +whom +whos +whose +widely +willing +wish +wont +words +world +wouldnt +www +yes +youd +you'll +youre +you've +zero +due +don +out +only +what \ No newline at end of file diff --git a/src/main/resources/LexSemResources/wordnet/wordnet_properties.xml b/src/main/resources/LexSemResources/wordnet/wordnet_properties.xml index 9091f8e..5e37713 100644 --- a/src/main/resources/LexSemResources/wordnet/wordnet_properties.xml +++ b/src/main/resources/LexSemResources/wordnet/wordnet_properties.xml @@ -1,56 +1,56 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/main/resources/de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory/resources.xml b/src/main/resources/de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory/resources.xml index 69a753c..43dfb74 100644 --- a/src/main/resources/de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory/resources.xml +++ b/src/main/resources/de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory/resources.xml @@ -8,7 +8,7 @@ - + + + de.jungblut.glove + glove + 0.3 + + diff --git a/src/main/java/com/gessi/dependency_detection/WordEmbedding.java b/src/main/java/com/gessi/dependency_detection/WordEmbedding.java new file mode 100644 index 0000000..aee70f1 --- /dev/null +++ b/src/main/java/com/gessi/dependency_detection/WordEmbedding.java @@ -0,0 +1,56 @@ +package com.gessi.dependency_detection; + +import de.jungblut.glove.GloveRandomAccessReader; +import de.jungblut.glove.impl.GloveBinaryRandomAccessReader; +import 
de.jungblut.math.DoubleVector;
+
+import java.io.IOException;
+import java.nio.file.Paths;
+
+import static java.lang.Math.sqrt;
+
+public class WordEmbedding {
+
+    GloveRandomAccessReader db = new GloveBinaryRandomAccessReader(Paths.get("gloveModel"));
+
+    public WordEmbedding() throws IOException {
+    }
+
+
+    /**
+     * Computes the cosine similarity between two words, if these vectors exist in the underlying Glove model
+     * @param a first word
+     * @param b second word
+     * @return The cosine similarity between the two words
+     */
+    public Double computeSimilarity(String a, String b) throws IOException {
+        DoubleVector help1 = null, help2 = null;
+        if (db.contains(a)) help1 = db.get(a);
+        if (db.contains(b)) help2 = db.get(b);
+        if (help1 != null && help2 != null) {
+            return cosineSimilarity(help1,help2);
+        } else return -1.0;
+    }
+
+
+    private Double cosineSimilarity(DoubleVector help1, DoubleVector help2) {
+        double[] one=help1.toArray();
+        double[] two=help2.toArray();
+        int length=one.length;
+        Double sum = 0.0;
+        if (two.length>length) length=two.length;
+        double normOne = 0.0;
+        double normTwo = 0.0;
+        // dot product over the longer length, treating missing entries as zero
+        for (int i=0;i<length;++i) {
+            double x = i < one.length ? one[i] : 0.0;
+            double y = i < two.length ? two[i] : 0.0;
+            sum += x * y;
+            normOne += x * x;
+            normTwo += y * y;
+        }
+        return sum / (sqrt(normOne) * sqrt(normTwo));
+    }
+}
diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java b/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java
index 47a21e7..cb74bd1 100644
--- a/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java
+++ b/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java
@@ -56,7 +56,7 @@ public double semanticSimilarity(String term1, String term2)
 
 	}
 
-	public Map<String, List<String>> prepareRequirements(Map<String, String> requirements, int maxSize) throws InterruptedException, ExecutionException, IOException {
+	public Map<String, String> prepareRequirements(Map<String, String> requirements, int maxSize) throws InterruptedException, ExecutionException, IOException {
 		List<Requirement> recs=new ArrayList<>();
 		for (String s:requirements.keySet()) {
 			recs.add(new com.gessi.dependency_detection.domain.Requirement(s,requirements.get(s)));
@@ -66,36 +66,14 @@ public Map<String, List<String>> prepareRequirements(Map<String, String> requirem
 		if (requirements.keySet().size()>100) {
 			TFIDFKeywordExtractor extractor=new TFIDFKeywordExtractor();
 			keywords=extractor.computeTFIDF(recs);
-			wordOrder=extractor.getWordOrder();
 		}
 		else {
 			RAKEKeywordExtractor extractor=new RAKEKeywordExtractor();
 			keywords=extractor.computeRake(recs);
-			wordOrder=extractor.getWordOrder();
 		}
-		return getNgrams(keywords,wordOrder,maxSize);
+		return keywords;
 	}
 
-	private Map<String, List<String>> getNgrams(Map<String, String> keywords, Map<String, Map<String, List<Integer>>> wordOrder, int maxSize) {
-		Map<String, List<String>> result=new HashMap<>();
-		for (String s:keywords.keySet()) {
-			TreeMap<Integer, String> orderedKeywords=new TreeMap<>();
-			for (String k:keywords.get(s).split(" ")) {
-				if (wordOrder.get(s).containsKey(k)) {
-					for (Integer i : wordOrder.get(s).get(k)) {
-						orderedKeywords.put(i, k);
-					}
-				}
-			}
-			List<String> ordered=new ArrayList<>();
-			for (String o:orderedKeywords.values()) {
-				ordered.add(o);
-			}
-			List<String> ngrams=ngrams(ordered,maxSize);
-			result.put(s,ngrams);
-		}
-		return result;
-	}
-
+
 	private List<String> ngrams(List<String> ordered, int maxSize) {
 		List<String> result=new ArrayList<>();
diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java b/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java
index 88c57db..bb93edb 100644
--- a/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java
+++ b/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java
@@ -3,6 +3,7 @@
 import java.io.IOException;
 import java.util.*;
 
+import com.gessi.dependency_detection.WordEmbedding;
 import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.S;
 import org.apache.log4j.varia.NullAppender;
 import org.apache.uima.UIMAException;
@@ -101,174 +102,14 @@ public int searchClasses(NLPAnalyser analizer) throws IOException, UIMAException
 	private String[] extractLemmas(String ontTerm) throws IOException {
 		TextPreprocessing textPreprocessing=new TextPreprocessing();
 		String l=textPreprocessing.text_preprocess(ontTerm);
-		return l.split(" ");
+		RAKEKeywordExtractor
-
-    /**
-     * Check the similarity between two terms
-     *
-     * @param reqTerm
-     * @param ontLemma
-     * @param analizer
-     * @param thr
-     * @return
-     * @throws SimilarityException
-     * @throws LexicalSemanticResourceException
-     */
-    private boolean isSynonym(String reqTerm, String ontLemma, NLPAnalyser analizer, double thr)
-            throws SimilarityException, LexicalSemanticResourceException {
-        if (!ontLemma.matches("\\d+|\\W+")) {
-            if (!synonyms.get(ontLemma).contains(reqTerm) && !noSynonyms.get(ontLemma).contains(reqTerm)) {
-                if (analizer.semanticSimilarity(reqTerm, ontLemma) >= thr) {
-                    synonyms.get(ontLemma).add(reqTerm);
-
-                    return true;
-                } else {
-                    noSynonyms.get(ontLemma).add(reqTerm);
-                }
-            } else if (synonyms.get(ontLemma).contains(reqTerm)) {
-                return true;
-            } else if (noSynonyms.get(ontLemma).contains(reqTerm)) {
-                return false;
-            }
-        }
-        return false;
-    }
-
-    /**
-     * Check if the req. term match with the term of the ontology
-     *
-     * @param term
-     * @param lemma
-     * @param ontWord
-     * @param ontLemma
-     * @return
-     * @throws SimilarityException
-     * @throws LexicalSemanticResourceException
-     */
-    private boolean isSameTerm(String term, String lemma, String ontWord, String ontLemma)
-            throws SimilarityException, LexicalSemanticResourceException {
-
-        if (term.equalsIgnoreCase(ontWord))
-            return true;
-        if (lemma.equals(ontWord))
-            return true;
-        if (lemma.equals(ontLemma))
-            return true;
-        if (term.equalsIgnoreCase(ontLemma))
-            return true;
-        if (term.toLowerCase().matches(ontWord + "s|es"))
-            return true;
-        if (lemma.matches(ontWord + "s|es"))
-            return true;
-        if (lemma.matches(ontLemma + "s|es"))
-            return true;
-        if (term.toLowerCase().matches(ontLemma + "s|es"))
-            return true;
-        return false;
-    }
-
-    /**
-     * check if a ordered set of words is the same of the set of words of the
-     * ontology
-     *
-     * @param ngramTerm
-     * @param ngramLemma
-     * @param words
-     * @param lemmas
-     * @param analizer
-     * @param syny
-     * @param thr
-     * @return
-     * @throws SimilarityException
-     * @throws LexicalSemanticResourceException
-     */
-    private boolean isSameNgram(Stack<String> ngramTerm, Stack<String> ngramLemma, String[] words, String[] lemmas,
-            NLPAnalyser analizer, boolean syny, double thr)
-            throws SimilarityException, LexicalSemanticResourceException {
-        boolean find = false;
-        ArrayList<Integer> idxOntLemmaAnalized = new ArrayList<>();
-        ArrayList<Integer> idxReqLemmaAnalized = new ArrayList<>();
-        for (int i = 0; i < ngramTerm.size(); i++) {
-            if (!find && i > 0) {
-                return false;
-            }
-            find = false;
-            int j = 0;
-            while (j < lemmas.length && j < words.length && !find) {
-                if (!idxOntLemmaAnalized.contains(j)
-                        && isSameTerm(ngramTerm.get(i), ngramLemma.get(i), words[j], lemmas[j])) {
-                    find = true;
-                    idxReqLemmaAnalized.add(i);
-                    idxOntLemmaAnalized.add(j);
-                }
-                j++;
-            }
-        }
-
-        // of it is not detected, check the synonymy
-        /*if (!find && syny) {
-
-            for (int i = 0; i < ngramLemma.size(); i++) {
-                if (!idxReqLemmaAnalized.contains(i)) {
-                    if (!find && i > 0) {
-                        return false;
-                    }
-                    find = false;
-                    int j = 0;
-                    while (j < lemmas.length && !find) {
-                        if (!idxOntLemmaAnalized.contains(j)
-                                && isSynonym(ngramLemma.get(i), lemmas[j], analizer, thr)) {
-                            find = true;
-                            idxOntLemmaAnalized.add(j);
-                        }
-                        j++;
-                    }
-                } else find = true;
-            }
-        }*/
-        return find;
-    }
-
-    /**
-     * Find all the combinations of the n-gram to check if the req. concept matches
-     * with the ont. concept
-     *
-     * @param idx
-     * @param level
-     * @param n
-     * @param termsNode
-     * @param lemmasNode
-     * @param ngramTerm
-     * @param ngramLemma
-     * @param words
-     * @param lemmas
-     * @param analizer
-     * @param syny
-     * @param thr
-     * @return
-     * @throws SimilarityException
-     * @throws LexicalSemanticResourceException
-     */
-    private boolean findPotentialNgram(int idx, int level, int n, String[] termsNode, String[] lemmasNode,
-            Stack<String> ngramTerm, Stack<String> ngramLemma, String[] words, String[] lemmas, NLPAnalyser analizer,
-            boolean syny, double thr) throws SimilarityException, LexicalSemanticResourceException {
-        boolean find = false;
-        for (int j = idx; j < termsNode.length && !find; j++) {
-            ngramTerm.push(termsNode[j]);
-            ngramLemma.push(lemmasNode[j]);
-            if (level < n) {
-                find = findPotentialNgram(j + 1, level + 1, n, termsNode, lemmasNode, ngramTerm, ngramLemma, words,
-                        lemmas, analizer, syny, thr);
-            }
-            if (level == n && isSameNgram(ngramTerm, ngramLemma, words, lemmas, analizer, syny, thr)) return true;
-            ngramTerm.pop();
-            ngramLemma.pop();
-        }
-        return find;
-    }
 
     /**
      * Find if the set of words contains a correct n-gram that match with the
@@ -281,41 +122,40 @@ private boolean findPotentialNgram(int idx, int level, int n, String[] termsNode,
      * @throws SimilarityException
      * @throws LexicalSemanticResourceException
      */
-    private boolean extractNGram(String node, String[] lemmas, boolean syny) throws SimilarityException, LexicalSemanticResourceException {
+    private boolean extractNGram(String node, String[] lemmas, boolean syny, double thr, WordEmbedding wordEmbedding) throws SimilarityException, LexicalSemanticResourceException, IOException {
         String[] lemmasNode = node.split(" ");
-        int n = lemmas.length;
-        System.out.println("LEMMAS");
-        for (String o : lemmas) {
-            System.out.println(o);
+        Set<String> nodeSet = new HashSet<>(Arrays.asList(lemmasNode));
+        Set<String> lemmaSet = new HashSet<>(Arrays.asList(lemmas));
+        if (syny) {
+            return isSynonym(nodeSet, lemmaSet, thr, wordEmbedding);
         }
-        System.out.println("NODE LEMMAS");
-        for (String o : lemmasNode) {
-            System.out.println(o);
+        else {
+            return nodeSet.containsAll(lemmaSet);
         }
+    }
 
-        if (!syny) {
-            for (int i = 0; i < n; ++i) {
-                if (!lemmas[i].equals(lemmasNode[i])) return false;
-            }
-        }
-        else {
-            for (int i = 0; i < n; ++i) {
-                if (!isSynonym(lemmas[i], lemmasNode[i])) return false;
+    private boolean isSynonym(Set<String> requirementLemmas, Set<String> ontologyLemmas, double thr, WordEmbedding wordEmbedding) throws IOException {
+        boolean isSynonym = true;
+        for (String s : ontologyLemmas) {
+            boolean synonymExists = false;
+            for (String l : requirementLemmas) {
+                if (wordEmbedding.computeSimilarity(s, l) >= thr) {
+                    synonymExists = true;
+                    break;
+                }
             }
+            isSynonym = isSynonym && synonymExists;
+            if (!isSynonym) return false;
         }
         return true;
     }
 
-    private boolean isSynonym(String lemma, String s) {
-        return false;
-    }
-
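+    // The new matching path is set-based rather than positional: extractNGram() compares the
+    // bag of requirement keywords against a class's lemmas, and isSynonym() accepts the class
+    // only when every ontology lemma clears the GloVe similarity threshold for some keyword.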
     /**
      * Analyze the potential term candidates extracted from the requirements (n-gram
      * concepts), and store the requirement within the related ontology class if
      * they match with a concept of the ontology.
      *
-     * @param ngrams
+     * @param keywords
      * @param reqId
      * @param requirement
      * @param syny
      * @throws SimilarityException
      * @throws LexicalSemanticResourceException
      */
-    public void matching(List<String> ngrams, String reqId, String requirement, boolean syny) throws IOException, SimilarityException, LexicalSemanticResourceException {
+    public void matching(String keywords, String reqId, String requirement, boolean syny, double thr, WordEmbedding wordEmbedding) throws IOException, SimilarityException, LexicalSemanticResourceException {
         ArrayList<OntClass> classes = new ArrayList<>();
         String[] lemmas;
-        for (int i = 0; i < ngrams.size(); i++) {
-            for (int j = 0; j < ontClasses.size(); j++) {
-                lemmas = classesLemmas.get(j);
-                if (ngrams.get(i).split(" ").length == lemmas.length && extractNGram(ngrams.get(i), lemmas, syny)) classes.add(ontClasses.get(j));
+        for (int j = 0; j < ontClasses.size(); j++) {
+            lemmas = classesLemmas.get(j);
+            if (keywords.split(" ").length >= lemmas.length && extractNGram(keywords, lemmas, syny, thr, wordEmbedding)) {
+                System.out.println("A MATCH WAS MADE BETWEEN:");
+                System.out.println("REQUIREMENT KEYWORDS: " + keywords);
+                // String.join() prints the lemmas themselves; calling toString() on the array
+                // would only print a reference such as [Ljava.lang.String;@1f2a3b.
+                System.out.println("ONTOLOGY NAME: " + String.join(" ", lemmas));
+
+                classes.add(ontClasses.get(j));
             }
         }
 
         // Requirement instantiation within the ontology
         for (OntClass cls : classes) {
-            System.out.println(cls.getLocalName());
+            System.out.println("A MATCH WAS MADE");
             Individual individual = this.model.createIndividual(this.source + ":" + reqId + "_" + cls.getLocalName(),
                     cls);
             DatatypeProperty req = this.model.getDatatypeProperty(this.source + "#requirement");
diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java b/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java
index 5f71495..90c67e8 100644
--- a/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java
+++ b/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java
@@ -122,7 +122,7 @@ List<String> RAKEanalyze(String text) throws IOException {
      * @param text Text to clean
      * @return Returns a cleaned list of strings
      */
-    List<String> RAKEanalyzeNoStopword(String text) throws IOException {
+    public List<String> RAKEanalyzeNoStopword(String text) throws IOException {
         Analyzer analyzer = CustomAnalyzer.builder()
                 .withTokenizer("standard")
                 .addTokenFilter("lowercase")
diff --git a/src/main/java/com/gessi/dependency_detection/service/DependencyService.java b/src/main/java/com/gessi/dependency_detection/service/DependencyService.java
index d4e7950..0c3e1ce 100644
--- a/src/main/java/com/gessi/dependency_detection/service/DependencyService.java
+++ b/src/main/java/com/gessi/dependency_detection/service/DependencyService.java
@@ -10,6 +10,7 @@
 import java.util.Map.Entry;
 import java.util.concurrent.ExecutionException;
 
+import com.gessi.dependency_detection.WordEmbedding;
 import org.apache.uima.UIMAException;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.springframework.beans.factory.annotation.Autowired;
@@ -194,9 +195,10 @@ public ObjectNode conflictDependencyDetection(String projectId, boolean syny, do
         // read the requirements from JSON
         Map<String, String> requirements = jsonHandler.readRequirement(json, projectId);
         // foreach requirement
-        Map<String, List<String>> syntxResutls = analizer.prepareRequirements(requirements, maxSize);
+        Map<String, String> syntxResutls = analizer.prepareRequirements(requirements, maxSize);
+        WordEmbedding wordEmbedding = new 
WordEmbedding();// Declared here so it won't initialize every time for (Entry entry : requirements.entrySet()) { - ontHandler.matching(syntxResutls.get(entry.getKey()), entry.getKey(), entry.getValue(), syny); + ontHandler.matching(syntxResutls.get(entry.getKey()), entry.getKey(), entry.getValue(), syny,thr,wordEmbedding); } // Extract dependencies from the ontology List deps = ontHandler.ontConflictDetection(); diff --git a/src/main/resources/ExcludedWords.txt b/src/main/resources/ExcludedWords.txt index 5aad907..5fc76a1 100644 --- a/src/main/resources/ExcludedWords.txt +++ b/src/main/resources/ExcludedWords.txt @@ -718,7 +718,6 @@ every ere each et -etc elsewhere else ex diff --git a/src/test/java/com/gessi/dependency_detection/AppTest.java b/src/test/java/com/gessi/dependency_detection/AppTest.java index 3031165..cebf557 100644 --- a/src/test/java/com/gessi/dependency_detection/AppTest.java +++ b/src/test/java/com/gessi/dependency_detection/AppTest.java @@ -58,7 +58,7 @@ public void Success() throws Exception { "application/json", jsonFile.toString().getBytes()); - this.mockMvc.perform(MockMvcRequestBuilders.fileUpload("/upc/dependency-detection/json/ontology/ABC?synonymy=false&threshold=0.1") + this.mockMvc.perform(MockMvcRequestBuilders.fileUpload("/upc/dependency-detection/json/ontology/ABC?synonymy=true&threshold=0.1") .file(ontology) .file(json)) .andExpect(status().isOk()); From d0f6edda3c7537f2922b7dbe0cf11413f21aaf1f Mon Sep 17 00:00:00 2001 From: Quim Date: Tue, 3 Dec 2019 13:51:33 +0100 Subject: [PATCH 4/8] add keywordTool to controller + WIP restore old algorithm --- .../controller/Controller.java | 7 +++-- .../domain/KeywordTool.java | 8 ++++++ .../service/DependencyService.java | 26 ++++++++++++++----- 3 files changed, 32 insertions(+), 9 deletions(-) create mode 100644 src/main/java/com/gessi/dependency_detection/domain/KeywordTool.java diff --git a/src/main/java/com/gessi/dependency_detection/controller/Controller.java b/src/main/java/com/gessi/dependency_detection/controller/Controller.java index d080941..48a596c 100644 --- a/src/main/java/com/gessi/dependency_detection/controller/Controller.java +++ b/src/main/java/com/gessi/dependency_detection/controller/Controller.java @@ -8,6 +8,7 @@ import javax.validation.constraints.NotNull; import javax.ws.rs.QueryParam; +import com.gessi.dependency_detection.domain.KeywordTool; import com.gessi.dependency_detection.util.Control; import org.apache.uima.UIMAException; import org.apache.uima.resource.ResourceInitializationException; @@ -79,7 +80,9 @@ public ResponseEntity uploadJSONFile( @ApiParam(value = "The JSON file to upload", required = true) @RequestPart("json") @Valid String json, @ApiParam(value = "Id of the project where the requirements to analize are.", required = true) @PathVariable("projectId") String projectId, @ApiParam(value = "If true, semantic similarity (synonymy) detection is applied to improve the detection algorithm.", required = true) @RequestParam(value = "synonymy", required = true) Boolean synonymy, - @ApiParam(value = "Threshold of semantic similarity to detect synonyms (included).", required = false) @RequestParam(value = "threshold", required = false) Double threshold) + @ApiParam(value = "Threshold of semantic similarity to detect synonyms (included).", required = false) @RequestParam(value = "threshold", required = false) Double threshold, + @ApiParam(value = "Keyword extraction tool (RULE_BASED or TFIDF_BASED)", required = false) @RequestParam(value = "keywordTool", required = false, + defaultValue 
= "RULE_BASED") KeywordTool keywordTool) throws IOException, InterruptedException { Control.getInstance().showInfoMessage("Start computing"); ObjectNode onjN = null; @@ -102,7 +105,7 @@ public ResponseEntity uploadJSONFile( // apply the dependency detection onjN = depService.conflictDependencyDetection(projectId, synonymy, - threshold); + threshold, keywordTool); /* Delete the uploaded file */ depService.deleteAll(); diff --git a/src/main/java/com/gessi/dependency_detection/domain/KeywordTool.java b/src/main/java/com/gessi/dependency_detection/domain/KeywordTool.java new file mode 100644 index 0000000..341083e --- /dev/null +++ b/src/main/java/com/gessi/dependency_detection/domain/KeywordTool.java @@ -0,0 +1,8 @@ +package com.gessi.dependency_detection.domain; + +public enum KeywordTool { + + RULE_BASED, + TFIDF_BASED + +} diff --git a/src/main/java/com/gessi/dependency_detection/service/DependencyService.java b/src/main/java/com/gessi/dependency_detection/service/DependencyService.java index 0c3e1ce..30d2e54 100644 --- a/src/main/java/com/gessi/dependency_detection/service/DependencyService.java +++ b/src/main/java/com/gessi/dependency_detection/service/DependencyService.java @@ -5,12 +5,14 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.concurrent.ExecutionException; import com.gessi.dependency_detection.WordEmbedding; +import com.gessi.dependency_detection.domain.KeywordTool; import org.apache.uima.UIMAException; import org.apache.uima.resource.ResourceInitializationException; import org.springframework.beans.factory.annotation.Autowired; @@ -186,7 +188,7 @@ public void loadOntology() throws IOException { * @throws dkpro.similarity.algorithms.api.SimilarityException * @throws LexicalSemanticResourceException */ - public ObjectNode conflictDependencyDetection(String projectId, boolean syny, double thr) + public ObjectNode conflictDependencyDetection(String projectId, boolean syny, double thr, KeywordTool keywordTool) throws IOException, ResourceInitializationException, UIMAException, dkpro.similarity.algorithms.api.SimilarityException, LexicalSemanticResourceException, ExecutionException, InterruptedException { @@ -195,13 +197,23 @@ public ObjectNode conflictDependencyDetection(String projectId, boolean syny, do // read the requirements from JSON Map requirements = jsonHandler.readRequirement(json, projectId); // foreach requirement - Map syntxResutls=analizer.prepareRequirements(requirements,maxSize); - WordEmbedding wordEmbedding=new WordEmbedding();// Declared here so it won't initialize every time - for (Entry entry : requirements.entrySet()) { - ontHandler.matching(syntxResutls.get(entry.getKey()), entry.getKey(), entry.getValue(), syny,thr,wordEmbedding); + + List deps = new ArrayList<>(); + + if (keywordTool.equals(KeywordTool.TFIDF_BASED)) { + Map syntxResutls = analizer.prepareRequirements(requirements, maxSize); + WordEmbedding wordEmbedding = new WordEmbedding();// Declared here so it won't initialize every time + for (Entry entry : requirements.entrySet()) { + ontHandler.matching(syntxResutls.get(entry.getKey()), entry.getKey(), entry.getValue(), syny, thr, wordEmbedding); + } + // Extract dependencies from the ontology + deps = ontHandler.ontConflictDetection(); } - // Extract dependencies from the ontology - List deps = ontHandler.ontConflictDetection(); + + else if 
(keywordTool.equals(KeywordTool.RULE_BASED)) { + //TODO old method + } + System.out.println(deps.size()); return jsonHandler.storeDependencies(json, deps); } From f03e6b013bf1530db83353488b0832fd02bf753d Mon Sep 17 00:00:00 2001 From: Antoni Casas Date: Tue, 3 Dec 2019 15:49:19 +0100 Subject: [PATCH 5/8] Adds RULE_BASED and TF_IDF option to differentiate between algorithms --- .../functionalities/NLPAnalyser.java | 324 ++++- .../functionalities/OntologyHandler.java | 268 +++- .../functionalities/RAKEKeywordExtractor.java | 18 - .../TFIDFKeywordExtractor.java | 18 - .../service/DependencyService.java | 22 +- src/main/resources/ExcludedWords.txt | 1074 ----------------- .../wordnet/wordnet_properties.xml | 112 +- .../resources.xml | 2 +- .../gessi/dependency_detection/AppTest.java | 2 +- 9 files changed, 653 insertions(+), 1187 deletions(-) delete mode 100644 src/main/resources/ExcludedWords.txt diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java b/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java index 992a500..8165a7d 100644 --- a/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java +++ b/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java @@ -3,10 +3,19 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.FileReader; +import java.io.InputStream; import java.util.*; import java.util.concurrent.ExecutionException; +import com.gessi.dependency_detection.components.Node; import com.gessi.dependency_detection.util.Control; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpLemmatizer; +import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpParser; +import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpPosTagger; import dkpro.similarity.algorithms.api.SimilarityException; import de.tudarmstadt.ukp.dkpro.lexsemresource.LexicalSemanticResource; @@ -15,12 +24,36 @@ import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException; import dkpro.similarity.algorithms.lsr.LexSemResourceComparator; import dkpro.similarity.algorithms.lsr.path.WuPalmerComparator; +import opennlp.tools.sentdetect.SentenceDetectorME; +import opennlp.tools.sentdetect.SentenceModel; +import opennlp.tools.tokenize.Tokenizer; +import opennlp.tools.tokenize.TokenizerME; +import opennlp.tools.tokenize.TokenizerModel; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.testing.factory.TokenBuilder; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.springframework.core.io.ClassPathResource; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; public class NLPAnalyser { + private static ClassPathResource sentPath = new ClassPathResource("en-sent.bin"); + private static ClassPathResource tokenPath = new ClassPathResource("en-token.bin"); + + private SentenceDetectorME sentenceDetector = null; private LexSemResourceComparator comparatorWN = 
null;
+    private AnalysisEngineDescription tagger = null;
+    private AnalysisEngineDescription lemma = null;
+    private AnalysisEngine parserEngine = null;
+    private AnalysisEngine lemmaEngine = null;
     private LexicalSemanticResource wordnet;
 
     /**
@@ -56,13 +89,12 @@ public double semanticSimilarity(String term1, String term2)
     }
 
-    public Map<String, String> prepareRequirements(Map<String, String> requirements, int maxSize) throws InterruptedException, ExecutionException, IOException {
+    public Map<String, String> prepareRequirements(Map<String, String> requirements) throws InterruptedException, ExecutionException, IOException {
         List<com.gessi.dependency_detection.domain.Requirement> recs = new ArrayList<>();
         for (String s : requirements.keySet()) {
             recs.add(new com.gessi.dependency_detection.domain.Requirement(s, requirements.get(s)));
         }
         Map<String, String> keywords;
         if (requirements.keySet().size() > 100) {
             TFIDFKeywordExtractor extractor = new TFIDFKeywordExtractor();
             keywords = extractor.computeTFIDF(recs);
@@ -74,19 +106,46 @@ public Map<String, String> prepareRequirements(Map<String, String> requirements,
             keywords = extractor.computeRake(recs);
         }
+    public List<Node> requirementAnalysis(String requirement)
+            throws IOException, ResourceInitializationException, UIMAException {
+
+        ArrayList<Node> synResult = new ArrayList<>();
-    private List<String> ngrams(List<String> ordered, int maxSize) {
-        List<String> result = new ArrayList<>();
-        for (int i = 0; i
     private List<String> readFile(String path) {
             Control.getInstance().showErrorMessage(e.getMessage());
         }
         return fileLines;
-    }}
\ No newline at end of file
+    }
+
+//-------------------------------------------------------------- RULE BASED
+
+    /**
+     * Noisy text cleaning.
+     * Rule-based method: it uses regular expressions to replace and remove noisy text.
+     * @param sentence
+     * @return
+     */
+    private String clearSentence(String sentence) {
+        // (\(LC\))$
+        sentence = sentence.replaceAll("^\t", "");
+        sentence = sentence.replaceAll("\\.(\\s)?(\\((\\w|\\W)+\\))$", ".");
+        sentence = sentence.replaceAll("^(\\s)?(\\((\\w|\\W)+\\))$", ".");
+        // ^(\(?[a-zA-Z](\)|\.)(\t)?)|^(\(?\d+((\.\d+)+)?(\)|\.)?(\t)?)|^[\u2022,\u2023,\u25E6,\u2043,\u2219,\W]+(\s|\t)?
+        sentence = sentence.replaceAll(
+                "^(\\(?[a-zA-Z](\\)|\\.)(\\t)?)|^(\\(?\\d+((\\.\\d+)+)?(\\)|\\.)?(\\t)?)|^[\\u2022,\\u2023,\\u25E6,\\u2043,\\u2219,\\W]+(\\s|\\t)?",
+                "");
+        //
+        sentence = sentence.replaceAll("^(\\(?(ix|iv|v?i{1,3}|x?i{1,3})\\)?)(?![a-zA-Z]).?(\\t|\\s)?", "");
+        // |^(RBC)\s\d+(\.)?\s?
+        sentence = sentence.replaceAll(
+                "^(NOTE)\\s\\d+(\\.)?\\s?|^(RBC)\\s\\d+(\\.)?\\s?|^(OBU)\\s\\d+(\\.)?\\s?|^(EA)\\s\\d+(\\.)?\\s?|^(CE)\\s\\d+(\\.)?\\s?|^(GEN)\\s\\d+(\\.)?\\s?|^(LED)\\s\\d+(\\.)?\\s?",
+                "");
+        // \\\/
+        sentence = sentence.replaceAll("\\/", " / ");
+
+        // \w+('s)(?!\w)
+        sentence = sentence.replaceAll("('s)(?!\\w)", " 's");
+        // parentheses, quotation marks
+        sentence = sentence.replaceAll("\\(", " ( ").replaceAll("\\)", " ) ").replaceAll("[\"“”]", " \" ");
+        // \.(\s) -> used to separate the final period when sentence detection doesn't
+        // split the phrase correctly
+        sentence = sentence.replaceAll("\\.(\\s)", " . 
"); + sentence = sentence.replaceAll("\\s+", " "); + + // Check the endpoint of the sentence + if (sentence.length() > 1) { + if (sentence.substring(sentence.length() - 1).equals(";") + || sentence.substring(sentence.length() - 1).equals(",") + || sentence.substring(sentence.length() - 1).equals(":")) { + sentence = sentence.substring(0, sentence.length() - 1); + } + if (!sentence.substring(sentence.length() - 1).equals(".")) { + sentence = sentence + "."; + } + } + return sentence; + } + + /** + * Tokenization (OpenNLP) + * @param requirmenet + * @return + * @throws IOException + */ + public String[] tokenization(String requirmenet) throws IOException { + InputStream inputFile = null; + inputFile = tokenPath.getInputStream(); + TokenizerModel model = new TokenizerModel(inputFile); + Tokenizer tokenizer = new TokenizerME(model); + return tokenizer.tokenize(requirmenet); + } + + /** + * Sentence Boundary Disambiguation (SBD) (openNLP) + * @param sentence + * @return + * @throws IOException + */ + public String[] sentenceDetection(String sentence) throws IOException { + String[] sentences = null; + // Loading sentence detector model + InputStream inputStream = null; + if (sentenceDetector == null) { + inputStream = sentPath.getInputStream(); + } + try { + if (sentenceDetector == null) { + SentenceModel model = new SentenceModel(inputStream); + + // Instantiating the SentenceDetectorME class + sentenceDetector = new SentenceDetectorME(model); + } + // Detecting the sentence + sentences = sentenceDetector.sentDetect(sentence); + } catch (IOException e) { + Control.getInstance().showErrorMessage(e.getMessage()); + } finally { + if (inputStream != null) { + try { + inputStream.close(); + } catch (IOException e) { + Control.getInstance().showErrorMessage(e.getMessage()); + } + } + } + return sentences; + } + + /** + * Used to set the requirement text into the pipeline + * @param aEngine + * @param aLanguage + * @param aText + * @return + * @throws UIMAException + */ + public static JCas runParser(AnalysisEngine aEngine, String aLanguage, String aText) throws UIMAException { + + JCas jcas = aEngine.newJCas(); + + jcas.setDocumentLanguage(aLanguage); + + TokenBuilder tb = new TokenBuilder<>(Token.class, Sentence.class); + tb.buildTokens(jcas, aText); + + aEngine.process(jcas); + + return jcas; + } + /** + * Dependency parser engine (clearNLP) + * This function generates a dependency tree from the dependency parser results. 
+ * @param aText + * @return + * @throws ResourceInitializationException + * @throws UIMAException + * @throws IOException + */ + public Node dependencyParser(String aText) throws ResourceInitializationException, UIMAException, IOException { + if (parserEngine == null) { + parserEngine = createEngine(createEngineDescription(createEngineDescription(tagger, lemma), + createEngineDescription(ClearNlpParser.class))); + } + JCas jcas = runParser(parserEngine, "en", aText); + Node root = null; + ArrayList dependencyTree = new ArrayList<>(); + Collection deps = JCasUtil.select(jcas, Dependency.class); + if (!deps.isEmpty()) { + for (Dependency d : deps) { + Node node = new Node(d.getDependent().getBegin(), d.getGovernor().getBegin(), + d.getDependent().getPosValue(), d.getDependencyType(), d.getDependent().getCoveredText(), + d.getDependent().getLemmaValue(), d); + dependencyTree.add(node); + } + + root = fillTreeLinks(dependencyTree); + } + return root; + } + /** + * Update the tree information + * @param tree + * @return + */ + private Node fillTreeLinks(ArrayList tree) { + Node root = null; + for (Node n : tree) { + if (n.getParentId() > n.getId()) { + int pIdx = findParent(tree, n.getParentId(), tree.indexOf(n) + 1, n.getParentId() > n.getId()); + tree.get(pIdx).addSonNodes(n); + + } else if (n.getParentId() < n.getId()) { + int pIdx = findParent(tree, n.getParentId(), tree.indexOf(n) - 1, n.getParentId() > n.getId()); + tree.get(pIdx).addSonNodes(n); + } else { + root = n; + } + } + return root; + } + /** + * Find the parent of the node from the dependncy parser results + * @param tree + * @param parentId + * @param idx + * @param next + * @return + */ + private int findParent(ArrayList tree, int parentId, int idx, boolean next) { + boolean find = false; + while (!find) { + if (tree.get(idx).getId() == parentId) { + find = true; + } else if (next) { + idx++; + } else { + idx--; + } + } + return idx; + } + + /** + * Lemmatization engine (clearNLP) + * @param term + * @return + * @throws UIMAException + */ + public String lemmatization(String term) throws UIMAException { + if (lemmaEngine == null && tagger == null && lemma == null) { + + tagger = createEngineDescription(ClearNlpPosTagger.class); + lemma = createEngineDescription(ClearNlpLemmatizer.class); + + lemmaEngine = createEngine(createEngineDescription(tagger, lemma)); + } + JCas jcas = runParser(lemmaEngine, "en", term); + Collection lemmas = JCasUtil.select(jcas, Lemma.class); + String ret = ""; + String[] terms = term.split(" "); + int i = 0; + if (!lemmas.isEmpty()) { + for (Lemma l : lemmas) { + if (!l.getValue().matches("\\d+")) { + if (!ret.equals("")) + ret = ret.concat(" " + l.getValue()); + else + ret = l.getValue(); + } else { + if (!ret.equals("")) + ret = ret.concat(" " + terms[i]); + else + ret = terms[i]; + } + i++; + } + } + return ret; + } + + + + + + + +} \ No newline at end of file diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java b/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java index bb93edb..81ece8b 100644 --- a/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java +++ b/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java @@ -4,6 +4,7 @@ import java.util.*; import com.gessi.dependency_detection.WordEmbedding; +import com.gessi.dependency_detection.components.Node; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.S; import org.apache.log4j.varia.NullAppender; import 
org.apache.uima.UIMAException; @@ -58,12 +59,11 @@ public void loadOnt(String source, String path) throws IOException { /** * Analyse ontology classes and extract its infromation (terms, lemmas) * - * @param analizer * @throws IOException * @throws UIMAException * @return */ - public int searchClasses(NLPAnalyser analizer) throws IOException, UIMAException { + public int searchClassesTfIdfBased() throws IOException, UIMAException { ontClasses = new ArrayList<>(); classesWords = new ArrayList<>(); classesLemmas = new ArrayList<>(); @@ -99,6 +99,48 @@ public int searchClasses(NLPAnalyser analizer) throws IOException, UIMAException return max; } + /** + * Analyse ontology classes and extract its infromation (terms, lemmas) + * + * @param analizer + * @throws IOException + * @throws UIMAException + */ + public void searchClasses(NLPAnalyser analizer) throws IOException, UIMAException { + ontClasses = new ArrayList<>(); + classesWords = new ArrayList<>(); + classesLemmas = new ArrayList<>(); + ExtendedIterator rootClasses = this.model.listClasses(); + while (rootClasses.hasNext()) { + OntClass thisClass = (OntClass) rootClasses.next(); + + if (thisClass.getLocalName() != null) { + String ontTerm = ""; + String[] words = thisClass.getLocalName() + .split("_|\\s|(?()); + noSynonyms.put(lemmas[i], new ArrayList<>()); + } + } + } + } + + private String[] extractLemmas(String ontTerm) throws IOException { TextPreprocessing textPreprocessing=new TextPreprocessing(); String l=textPreprocessing.text_preprocess(ontTerm); @@ -107,6 +149,10 @@ private String[] extractLemmas(String ontTerm) throws IOException { String[] res=new String[resAnalysis.size()]; return resAnalysis.toArray(res); } + private String[] extractLemmas(String words, NLPAnalyser analizer) throws IOException, UIMAException { + String ontLemma = analizer.lemmatization(words); + return ontLemma.split(" "); + } @@ -190,6 +236,48 @@ public void matching(String keywords, String reqId, String requirement, boolean individual.setPropertyValue(className, this.model.createTypedLiteral(cls.getLocalName())); } } + /** + * Analyze the potential term candidates extracted from the requirements (n-gram + * concepts), and store the requirement within the related ontology class if + * they matches with a concept of the ontology. 
+     *
+     * @param topNodes
+     * @param reqId
+     * @param requirement
+     * @param analizer
+     * @param syny
+     * @param thr
+     * @throws IOException
+     * @throws SimilarityException
+     * @throws LexicalSemanticResourceException
+     */
+
+    public void matchingRuleBased(List<Node> topNodes, String reqId, String requirement, NLPAnalyser analizer, boolean syny,
+            double thr) throws IOException, SimilarityException, LexicalSemanticResourceException {
+        ArrayList<OntClass> classes = new ArrayList<>();
+        String[] words;
+        String[] lemmas;
+        for (int i = 0; i < topNodes.size(); i++) {
+            for (int j = 0; j < ontClasses.size(); j++) {
+                words = classesWords.get(j);
+                lemmas = classesLemmas.get(j);
+                if (topNodes.get(i).getTerm().split(" ").length >= words.length && extractNGramRuleBased(topNodes.get(i), words, lemmas, analizer, syny, thr)) classes.add(ontClasses.get(j));
+            }
+        }
+
+        // Requirement instantiation within the ontology
+        for (OntClass cls : classes) {
+            Individual individual = this.model.createIndividual(this.source + ":" + reqId + "_" + cls.getLocalName(),
+                    cls);
+            DatatypeProperty req = this.model.getDatatypeProperty(this.source + "#requirement");
+            individual.setPropertyValue(req, this.model.createTypedLiteral(requirement));
+            DatatypeProperty id = this.model.getDatatypeProperty(this.source + "#id");
+            individual.setPropertyValue(id, this.model.createTypedLiteral(reqId));
+            DatatypeProperty className = this.model.getDatatypeProperty(this.source + "#class");
+            individual.setPropertyValue(className, this.model.createTypedLiteral(cls.getLocalName()));
+        }
+    }
 
     /**
      * Analyze the ontology and extract dependencies
@@ -278,5 +366,181 @@ private List<Resource> displayRestriction(OntProperty property, Resource constrain
         result.add(constraint);
         return result;
     }
+    private boolean extractNGramRuleBased(Node node, String[] words, String[] lemmas, NLPAnalyser analizer, boolean syny,
+            double thr) throws SimilarityException, LexicalSemanticResourceException {
+        String[] termsNode = node.getTerm().split(" ");
+        String[] lemmasNode = node.getLemma().split(" ");
+        int n = words.length;
+        Stack<String> ngramTerm = new Stack<>();
+        Stack<String> ngramLemma = new Stack<>();
+
+        return findPotentialNgram(0, 1, n, termsNode, lemmasNode, ngramTerm, ngramLemma, words, lemmas, analizer, syny, thr);
+    }
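+    // extractNGramRuleBased() seeds a depth-first search over the requirement's words:
+    // findPotentialNgram() below builds candidate n-grams on two stacks (terms and lemmas)
+    // and isSameNgram() tests every complete candidate of length n against the class words.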
+    /**
+     * Find all the combinations of the n-gram to check if the req. concept matches
+     * with the ont. concept
+     *
+     * @param idx
+     * @param level
+     * @param n
+     * @param termsNode
+     * @param lemmasNode
+     * @param ngramTerm
+     * @param ngramLemma
+     * @param words
+     * @param lemmas
+     * @param analizer
+     * @param syny
+     * @param thr
+     * @return
+     * @throws SimilarityException
+     * @throws LexicalSemanticResourceException
+     */
+    private boolean findPotentialNgram(int idx, int level, int n, String[] termsNode, String[] lemmasNode,
+            Stack<String> ngramTerm, Stack<String> ngramLemma, String[] words, String[] lemmas, NLPAnalyser analizer,
+            boolean syny, double thr) throws SimilarityException, LexicalSemanticResourceException {
+        boolean find = false;
+        for (int j = idx; j < termsNode.length && !find; j++) {
+            ngramTerm.push(termsNode[j]);
+            ngramLemma.push(lemmasNode[j]);
+            if (level < n) {
+                find = findPotentialNgram(j + 1, level + 1, n, termsNode, lemmasNode, ngramTerm, ngramLemma, words,
+                        lemmas, analizer, syny, thr);
+            }
+            if (level == n && isSameNgram(ngramTerm, ngramLemma, words, lemmas, analizer, syny, thr)) return true;
+            ngramTerm.pop();
+            ngramLemma.pop();
+        }
+        return find;
+    }
+
+    /**
+     * Check if an ordered set of words is the same as the set of words of the
+     * ontology
+     *
+     * @param ngramTerm
+     * @param ngramLemma
+     * @param words
+     * @param lemmas
+     * @param analizer
+     * @param syny
+     * @param thr
+     * @return
+     * @throws SimilarityException
+     * @throws LexicalSemanticResourceException
+     */
+    private boolean isSameNgram(Stack<String> ngramTerm, Stack<String> ngramLemma, String[] words, String[] lemmas,
+            NLPAnalyser analizer, boolean syny, double thr)
+            throws SimilarityException, LexicalSemanticResourceException {
+        boolean find = false;
+        ArrayList<Integer> idxOntLemmaAnalized = new ArrayList<>();
+        ArrayList<Integer> idxReqLemmaAnalized = new ArrayList<>();
+        for (int i = 0; i < ngramTerm.size(); i++) {
+            if (!find && i > 0) {
+                return false;
+            }
+            find = false;
+            int j = 0;
+            while (j < words.length && !find) {
+                if (!idxOntLemmaAnalized.contains(j)
+                        && isSameTerm(ngramTerm.get(i), ngramLemma.get(i), words[j], lemmas[j])) {
+                    find = true;
+                    idxReqLemmaAnalized.add(i);
+                    idxOntLemmaAnalized.add(j);
+                }
+                j++;
+            }
+        }
+
+        // if it is not detected, check the synonymy
+        if (!find && syny) {
+
+            for (int i = 0; i < ngramLemma.size(); i++) {
+                if (!idxReqLemmaAnalized.contains(i)) {
+                    if (!find && i > 0) {
+                        return false;
+                    }
+                    find = false;
+                    int j = 0;
+                    while (j < lemmas.length && !find) {
+                        if (!idxOntLemmaAnalized.contains(j)
+                                && isSynonymRuleBased(ngramLemma.get(i), lemmas[j], analizer, thr)) {
+                            find = true;
+                            idxOntLemmaAnalized.add(j);
+                        }
+                        j++;
+                    }
+                } else find = true;
+            }
+        }
+        return find;
+    }
+
+    /**
+     * Check if the req.
term match with the term of the ontology + * + * @param term + * @param lemma + * @param ontWord + * @param ontLemma + * @return + * @throws SimilarityException + * @throws LexicalSemanticResourceException + */ + private boolean isSameTerm(String term, String lemma, String ontWord, String ontLemma) + throws SimilarityException, LexicalSemanticResourceException { + + if (term.equalsIgnoreCase(ontWord)) + return true; + if (lemma.equals(ontWord)) + return true; + if (lemma.equals(ontLemma)) + return true; + if (term.equalsIgnoreCase(ontLemma)) + return true; + + if (term.toLowerCase().matches(ontWord + "s|es")) + return true; + if (lemma.matches(ontWord + "s|es")) + return true; + if (lemma.matches(ontLemma + "s|es")) + return true; + if (term.toLowerCase().matches(ontLemma + "s|es")) + return true; + + return false; + } + + + /** + * Check the similarity between two terms + * + * @param reqTerm + * @param ontLemma + * @param analizer + * @param thr + * @return + * @throws SimilarityException + * @throws LexicalSemanticResourceException + */ + private boolean isSynonymRuleBased(String reqTerm, String ontLemma, NLPAnalyser analizer, double thr) + throws SimilarityException, LexicalSemanticResourceException { + if (!ontLemma.matches("\\d+|\\W+")) { + if (!synonyms.get(ontLemma).contains(reqTerm) && !noSynonyms.get(ontLemma).contains(reqTerm)) { + if (analizer.semanticSimilarity(reqTerm, ontLemma) >= thr) { + synonyms.get(ontLemma).add(reqTerm); + + return true; + } else { + noSynonyms.get(ontLemma).add(reqTerm); + } + } else if (synonyms.get(ontLemma).contains(reqTerm)) { + return true; + } else if (noSynonyms.get(ontLemma).contains(reqTerm)) { + return false; + } + } + return false; + } } diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java b/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java index 90c67e8..62540ea 100644 --- a/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java +++ b/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java @@ -18,7 +18,6 @@ public class RAKEKeywordExtractor { private Double cutoff = 3.0; private TextPreprocessing preprocess = new TextPreprocessing(); - private Map>> wordOrder=new HashMap<>(); /** * Passes the text through Lucene's token analyzer @@ -48,23 +47,9 @@ public List> extractKeywords(List corpus) throw Rake rake = new Rake(); for (Requirement s : corpus) { String text = ""; - int index=0; - Map> wordOrderInferior=new HashMap<>(); for (String k : RAKEanalyzeNoStopword(s.getDescription())) { - if (wordOrderInferior.containsKey(k)) { - List order=wordOrderInferior.get(k); - order.add(index); - wordOrderInferior.put(k,order); - } - else { - List order=new ArrayList<>(); - order.add(index); - wordOrderInferior.put(k,order); - } - index++; text = text + " " + k; } - wordOrder.put(s.getId(),wordOrderInferior); Map aux = rake.getKeywordsFromText(text); String sum = ""; for (String j : aux.keySet()) { @@ -131,7 +116,4 @@ public List RAKEanalyzeNoStopword(String text) throws IOException { return getAnalyzedStrings(text, analyzer); } - public Map>> getWordOrder() { - return wordOrder; - } } diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/TFIDFKeywordExtractor.java b/src/main/java/com/gessi/dependency_detection/functionalities/TFIDFKeywordExtractor.java index 5184785..64e0f24 100644 --- a/src/main/java/com/gessi/dependency_detection/functionalities/TFIDFKeywordExtractor.java +++ 
b/src/main/java/com/gessi/dependency_detection/functionalities/TFIDFKeywordExtractor.java @@ -16,7 +16,6 @@ public class TFIDFKeywordExtractor { private Double cutoffParameter=4.0; //This can be set to different values for different selectivity (more or less keywords) private HashMap corpusFrequency = new HashMap<>(); private TextPreprocessing text_preprocess = new TextPreprocessing(); - private Map>> wordOrder=new HashMap<>(); /** @@ -137,20 +136,7 @@ private String clean_text(String text,String reqId) throws IOException { text = text + " " + m.group().toUpperCase(); } } - int index=0; - Map> wordOrderInterior=new HashMap<>(); for (String a : text.split(" ")) { - if (wordOrderInterior.containsKey(a)) { - List order=wordOrderInterior.get(a); - order.add(index); - wordOrderInterior.put(a,order); - } - else { - List order=new ArrayList<>(); - order.add(index); - wordOrderInterior.put(a,order); - } - index++; String helper = ""; if (a.toUpperCase().equals(a)) { for (int i = 0; i < 10; ++i) { @@ -160,7 +146,6 @@ private String clean_text(String text,String reqId) throws IOException { } result = result.concat(" " + a); } - wordOrder.put(reqId,wordOrderInterior); return result; } @@ -182,7 +167,4 @@ public void setCutoffParameter(Double cutoffParameter) { } - public Map>> getWordOrder() { - return wordOrder; - } } diff --git a/src/main/java/com/gessi/dependency_detection/service/DependencyService.java b/src/main/java/com/gessi/dependency_detection/service/DependencyService.java index 30d2e54..5285c71 100644 --- a/src/main/java/com/gessi/dependency_detection/service/DependencyService.java +++ b/src/main/java/com/gessi/dependency_detection/service/DependencyService.java @@ -13,6 +13,7 @@ import com.gessi.dependency_detection.WordEmbedding; import com.gessi.dependency_detection.domain.KeywordTool; +import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException; import org.apache.uima.UIMAException; import org.apache.uima.resource.ResourceInitializationException; import org.springframework.beans.factory.annotation.Autowired; @@ -51,7 +52,7 @@ public class DependencyService { * @throws IOException */ @Autowired - public DependencyService(StorageProperties properties) throws IOException { + public DependencyService(StorageProperties properties) throws IOException, ResourceLoaderException { this.rootLocation = Paths.get(properties.getRootLocation()); this.ontLocation = Paths.get(properties.getOntLocation()); this.docLocation = Paths.get(properties.getDocLocation()); @@ -193,7 +194,8 @@ public ObjectNode conflictDependencyDetection(String projectId, boolean syny, do dkpro.similarity.algorithms.api.SimilarityException, LexicalSemanticResourceException, ExecutionException, InterruptedException { // analyse the ontology classes - int maxSize=ontHandler.searchClasses(analizer); + if (keywordTool.equals(KeywordTool.TFIDF_BASED)) ontHandler.searchClassesTfIdfBased(); + else ontHandler.searchClasses(analizer); // read the requirements from JSON Map requirements = jsonHandler.readRequirement(json, projectId); // foreach requirement @@ -201,7 +203,7 @@ public ObjectNode conflictDependencyDetection(String projectId, boolean syny, do List deps = new ArrayList<>(); if (keywordTool.equals(KeywordTool.TFIDF_BASED)) { - Map syntxResutls = analizer.prepareRequirements(requirements, maxSize); + Map syntxResutls = analizer.prepareRequirements(requirements); WordEmbedding wordEmbedding = new WordEmbedding();// Declared here so it won't initialize every time for (Entry entry : requirements.entrySet()) { 
ontHandler.matching(syntxResutls.get(entry.getKey()), entry.getKey(), entry.getValue(), syny, thr, wordEmbedding); @@ -211,7 +213,19 @@ public ObjectNode conflictDependencyDetection(String projectId, boolean syny, do } else if (keywordTool.equals(KeywordTool.RULE_BASED)) { - //TODO old method + for (Entry entry : requirements.entrySet()) { + String key = entry.getKey(); + String value = entry.getValue(); + if (key != null && value != null && !value.equals("")) { + // Apply NLP methods (syntactic approach) + List syntxResutls = analizer.requirementAnalysis(value); + + // Matching of extracted terms with the ontology, it is also applied the semantic appraoch + ontHandler.matchingRuleBased(syntxResutls, key, value, analizer, syny, thr); + } + } + // Extract dependencies from the ontology + deps = ontHandler.ontConflictDetection(); } System.out.println(deps.size()); diff --git a/src/main/resources/ExcludedWords.txt b/src/main/resources/ExcludedWords.txt deleted file mode 100644 index 5fc76a1..0000000 --- a/src/main/resources/ExcludedWords.txt +++ /dev/null @@ -1,1074 +0,0 @@ -x -y -your -yours -yourself -yourselves -you -yond -yonder -yon -ye -yet -z -zillion -j -u -umpteen -usually -us -username -uponed -upons -uponing -upon -ups -upping -upped -up -unto -until -unless -unlike -unliker -unlikest -under -underneath -use -used -usedest -r -rath -rather -rathest -rathe -re -relate -related -relatively -regarding -really -res -respecting -respectively -q -quite -que -qua -n -neither -neaths -neath -nethe -nethermost -necessary -necessariest -necessarier -never -nevertheless -nigh -nighest -nigher -nine -noone -nobody -nobodies -nowhere -nowheres -no -noes -nor -nos -no-one -none -not -notwithstanding -nothings -nothing -nathless -natheless -t -ten -tills -till -tilled -tilling -to -towards -toward -towardest -towarder -together -too -thy -thyself -thus -than -that -those -thou -though -thous -thouses -thoroughest -thorougher -thorough -thoroughly -thru -thruer -thruest -thro -through -throughout -throughest -througher -thine -this -thises -they -thee -the -then -thence -thenest -thener -them -themselves -these -therer -there -thereby -therest -thereafter -therein -thereupon -therefore -their -theirs -thing -things -three -two -o -oh -owt -owning -owned -own -owns -others -other -otherwise -otherwisest -otherwiser -of -often -oftener -oftenest -off -offs -offest -one -ought -oughts -our -ours -ourselves -ourself -out -outest -outed -outwith -outs -outside -over -overallest -overaller -overalls -overall -overs -or -orer -orest -on -oneself -onest -ons -onto -a -atween -at -athwart -atop -afore -afterward -afterwards -after -afterest -afterer -ain -an -any -anything -anybody -anyone -anyhow -anywhere -anent -anear -and -andor -another -around -ares -are -aest -aer -against -again -accordingly -abaft -abafter -abaftest -abovest -above -abover -abouter -aboutest -about -aid -amidst -amid -among -amongst -apartest -aparter -apart -appeared -appears -appear -appearing -appropriating -appropriate -appropriatest -appropriates -appropriater -appropriated -already -always -also -along -alongside -although -almost -all -allest -aller -allyou -alls -albeit -awfully -as -aside -asides -aslant -ases -astrider -astride -astridest -astraddlest -astraddler -astraddle -availablest -availabler -available -aughts -aught -vs -v -variousest -variouser -various -via -vis-a-vis -vis-a-viser -vis-a-visest -viz -very -veriest -verier -versus -k -g -go -gone -good -got -gotta -gotten -get -gets -getting -b -by -byandby 
-by-and-by -bist -both -but -buts -be -beyond -because -became -becomes -become -becoming -becomings -becominger -becomingest -behind -behinds -before -beforehand -beforehandest -beforehander -bettered -betters -better -bettering -betwixt -between -beneath -been -below -besides -beside -m -my -myself -mucher -muchest -much -must -musts -musths -musth -main -make -mayest -many -mauger -maugre -me -meanwhiles -meanwhile -mostly -most -moreover -more -might -mights -midst -midsts -h -huh -humph -he -hers -herself -her -hereby -herein -hereafters -hereafter -hereupon -hence -hadst -had -having -haves -have -has -hast -hardly -hae -hath -him -himself -hither -hitherest -hitherer -his -how-do-you-do -however -how -howbeit -howdoyoudo -hoos -hoo -w -woulded -woulding -would -woulds -was -wast -we -wert -were -with -withal -without -within -why -what -whatever -whateverer -whateverest -whatsoeverer -whatsoeverest -whatsoever -whence -whencesoever -whenever -whensoever -when -whenas -whether -wheen -whereto -whereupon -wherever -whereon -whereof -where -whereby -wherewithal -wherewith -whereinto -wherein -whereafter -whereas -wheresoever -wherefrom -which -whichever -whichsoever -whilst -while -whiles -whithersoever -whither -whosoever -whoso -whomever -s -syne -syn -shalling -shalled -shalls -shoulding -should -shoulded -shoulds -she -sayyid -sayid -said -saider -saidest -same -samest -sames -samer -saved -sans -sanses -sanserifs -sanserif -so -soer -soest -sobeit -someone -somebody -somehow -some -somewhere -somewhat -something -sometimest -sometimes -sometimer -sometime -several -severaler -severalest -serious -seriousest -seriouser -senza -send -sent -seem -seems -seemed -seemingest -seeminger -seemings -seven -summat -sups -sup -supping -supped -such -since -sine -sines -sith -six -stop -stopped -p -plaintiff -plenty -plenties -please -pleased -pleases -per -perhaps -particulars -particularly -particular -particularest -particularer -pro -providing -provides -provided -provide -probably -l -layabout -layabouts -latter -latterest -latterer -latterly -latters -lots -lotting -lotted -lot -lest -less -ie -ifs -if -i -info -information -itself -its -it -is -idem -idemer -idemest -immediate -immediately -immediatest -immediater -in -inwards -inwardest -inwarder -inward -inasmuch -into -instead -insofar -indicates -indicated -indicate -indicating -indeed -inc -f -fact -facts -fs -figupon -figupons -figuponing -figuponed -few -fewer -fewest -frae -from -failing -failings -five -furthers -furtherer -furthered -furtherest -further -furthering -furthermore -fourscore -followthrough -for -forwhy -fornenst -formerly -former -formerer -formerest -formers -forbye -forby -fore -forever -forer -fores -four -d -ddays -dday -do -doing -doings -doe -does -doth -downwarder -downwardest -downward -downwards -downs -done -doner -dones -donest -dos -dost -did -differentest -differenter -different -describing -describe -describes -described -despiting -despites -despited -despite -during -c -cum -circa -chez -cer -certain -certainest -certainer -cest -canst -cannot -cant -cants -canting -cantest -canted -co -could -couldst -comeon -comeons -come-ons -come-on -concerning -concerninger -concerningest -consequently -considering -e -eg -eight -either -even -evens -evenser -evensest -evened -evenest -ever -everyone -everything -everybody -everywhere -every -ere -each -et -elsewhere -else -ex -excepted -excepts -except -excepting -exes -enough -ins -able -abst -accordance -according -across -act -actually -added -adj 
-affected -affecting -affects -ah -alone -am -announce -anymore -anyway -anyways -apparently -approximately -aren -arent -arise -ask -asking -auth -away -back -begin -beginning -beginnings -begins -being -believe -biol -brief -briefly -ca -came -can -can't -cause -causes -certainly -com -come -comes -contain -containing -contains -couldnt -date -didn't -doesn't -don't -down -due -ed -edu -effect -eighty -end -ending -especially -et-al -far -ff -fifth -first -fix -followed -following -follows -forth -found -gave -give -given -gives -giving -goes -happens -hasn't -haven't -hed -here -heres -hes -hi -hid -home -hundred -id -i'll -im -importance -important -index -invention -isn't -itd -it'll -i've -just -keep -keeps -kept -kg -km -know -known -knows -largely -last -lately -later -least -let -lets -like -liked -likely -line -little -'ll -look -looking -looks -ltd -made -mainly -makes -may -maybe -mean -means -meantime -merely -mg -million -miss -ml -mr -mrs -mug -na -name -namely -nay -nd -near -nearly -necessarily -need -needs -new -next -ninety -non -nonetheless -normally -noted -now -obtain -obtained -obviously -ok -okay -old -omitted -once -ones -only -ord -owing -page -pages -part -past -placed -plus -poorly -possible -possibly -potentially -pp -predominantly -present -previously -primarily -promptly -proud -put -quickly -qv -ran -rd -readily -recent -recently -ref -refs -regardless -regards -research -resulted -resulting -results -right -run -saw -say -saying -says -sec -section -see -seeing -seeming -seen -self -selves -shall -shed -she'll -shes -shouldn't -show -showed -shown -showns -shows -significant -significantly -similar -similarly -slightly -somethan -somethin -soon -sorry -specifically -specified -specify -specifying -still -strongly -sub -substantially -successfully -sufficiently -suggest -sure -take -taken -taking -tell -tends -th -thank -thanks -thanx -that'll -thats -that've -thered -there'll -thereof -therere -theres -thereto -there've -theyd -they'll -theyre -they've -think -thoughh -thousand -throug -til -tip -took -tried -tries -truly -try -trying -ts -twice -un -unfortunately -unlikely -useful -usefully -usefulness -uses -using -value -'ve -vol -vols -want -wants -wasnt -way -wed -welcome -we'll -went -werent -we've -what'll -whats -wheres -whim -who -whod -whoever -whole -who'll -whom -whos -whose -widely -willing -wish -wont -words -world -wouldnt -www -yes -youd -you'll -youre -you've -zero -due -don -out -only -what \ No newline at end of file diff --git a/src/main/resources/LexSemResources/wordnet/wordnet_properties.xml b/src/main/resources/LexSemResources/wordnet/wordnet_properties.xml index 5e37713..9091f8e 100644 --- a/src/main/resources/LexSemResources/wordnet/wordnet_properties.xml +++ b/src/main/resources/LexSemResources/wordnet/wordnet_properties.xml @@ -1,56 +1,56 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/main/resources/de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory/resources.xml b/src/main/resources/de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory/resources.xml index 43dfb74..69a753c 100644 --- a/src/main/resources/de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory/resources.xml +++ b/src/main/resources/de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory/resources.xml @@ -8,7 +8,7 @@ - +
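A minimal end-to-end sketch of how the pieces introduced in these patches fit together. This is illustrative only: the class name KeywordToolSketch, the sample requirements, and the 0.5 threshold are invented for the example, and a binary GloVe model is assumed to be present at "gloveModel".

import java.util.HashMap;
import java.util.Map;

import com.gessi.dependency_detection.WordEmbedding;
import com.gessi.dependency_detection.domain.KeywordTool;

public class KeywordToolSketch {
    public static void main(String[] args) throws Exception {
        Map<String, String> requirements = new HashMap<>();
        requirements.put("R1", "The system shall encrypt stored user data.");
        requirements.put("R2", "Stored user data must be encrypted at rest.");

        // The controller defaults the request parameter to RULE_BASED;
        // TFIDF_BASED selects the new keyword + embedding pipeline.
        KeywordTool tool = KeywordTool.TFIDF_BASED;

        if (tool == KeywordTool.TFIDF_BASED) {
            // Construct the GloVe reader once, mirroring DependencyService:
            // opening the model is the expensive step, so it stays outside
            // the per-requirement loop.
            WordEmbedding embedding = new WordEmbedding();
            double threshold = 0.5; // example value; the service takes it from the request
            // computeSimilarity() returns -1.0 when either word is out of
            // vocabulary, so unknown words never clear a positive threshold.
            System.out.println(embedding.computeSimilarity("encrypt", "encrypted") >= threshold);
        }
    }
}

As in DependencyService, the sketch keeps WordEmbedding construction out of the matching loop; inside the loop each ontology class only costs cosine-similarity lookups against the already-loaded vectors.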