diff --git a/README.md b/README.md index b12e02f..6d13134 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,10 @@ First of all it is necessary to download the following external files and depend - [WordNet ESA](https://docs.google.com/uc?export=download&id=1I6oQqIeZva1CwLA96OkHFSZKiBfUgWLe) - [WordNet LexSemResources](https://docs.google.com/uc?export=download&id=1TeYlsHbcCtxbsVVoBvttdVsvbKFHPbZn) +Secondly, it is necessary to download the following file and extract its contents into the gloveModel/ folder (at the root of the service) + +- [GloveModel](https://drive.google.com/file/d/1E-jkanZQSjXAuwx3EXyGKAyMQ8QBWobA/view?usp=sharing) + Then is necessary to configure the DKPRO_HOME variable with the resources directory path: - export DKPRO_HOME=/path/dependency-detection/src/main/resources diff --git a/gloveModel/.gitignore b/gloveModel/.gitignore new file mode 100644 index 0000000..5e7d273 --- /dev/null +++ b/gloveModel/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore diff --git a/libs/linguistic/rake/1.0/LICENSE.txt b/libs/linguistic/rake/1.0/LICENSE.txt new file mode 100644 index 0000000..75dd38b --- /dev/null +++ b/libs/linguistic/rake/1.0/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Linguistic + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/libs/linguistic/rake/1.0/rake-1.0.jar b/libs/linguistic/rake/1.0/rake-1.0.jar new file mode 100644 index 0000000..9c1f64b Binary files /dev/null and b/libs/linguistic/rake/1.0/rake-1.0.jar differ diff --git a/libs/linguistic/rake/1.0/rake-1.0.pom.xml b/libs/linguistic/rake/1.0/rake-1.0.pom.xml new file mode 100644 index 0000000..66a3a9a --- /dev/null +++ b/libs/linguistic/rake/1.0/rake-1.0.pom.xml @@ -0,0 +1,10 @@ + + + 4.0.0 + linguistic + rake + 1.0 + POM was created from install:install-file + \ No newline at end of file diff --git a/pom.xml b/pom.xml index ed9c673..3104b88 100644 --- a/pom.xml +++ b/pom.xml @@ -53,6 +53,33 @@ spring-boot-starter-log4j2 + + + org.apache.lucene + lucene-analyzers-common + 7.7.1 + + + org.apache.lucene + lucene-core + 7.7.1 + + + linguistic + rake + 1.0 + system + ${project.basedir}/libs/linguistic/rake/1.0/rake-1.0.jar + + + + + + de.jungblut.glove + glove + 0.3 + + diff --git a/src/main/java/com/gessi/dependency_detection/WordEmbedding.java b/src/main/java/com/gessi/dependency_detection/WordEmbedding.java new file mode 100644 index 0000000..aee70f1 --- /dev/null +++ b/src/main/java/com/gessi/dependency_detection/WordEmbedding.java @@ -0,0 +1,56 @@ +package com.gessi.dependency_detection; + +import de.jungblut.glove.GloveRandomAccessReader; +import de.jungblut.glove.impl.GloveBinaryRandomAccessReader; +import de.jungblut.math.DoubleVector; + +import java.io.IOException; +import java.nio.file.Paths; + +import static java.lang.Math.sqrt; + +public class WordEmbedding { + + GloveRandomAccessReader db = new GloveBinaryRandomAccessReader(Paths.get("gloveModel")); + + public WordEmbedding() throws IOException { + } + + + /** + * Computes the cosine similarity between two words, if these vectors exist in the underlying Glove model + * @param a first word + * @param b second word + * @return The cosine similarity between the two words + */ + public Double computeSimilarity(String a, String b) throws IOException { + DoubleVector help1 = null, help2 = null; + if (db.contains(a)) help1 = db.get(a); + if (db.contains(b)) help2 = db.get(b); + if (help1 != null && help2 != null) { + return cosineSimilarity(help1,help2); + } else return -1.0; + } + + + private Double cosineSimilarity(DoubleVector help1, DoubleVector help2) { + double[] one=help1.toArray(); + double[] two=help2.toArray(); + int length=one.length; + Double sum = 0.0; + if (two.length>length) length=two.length; + for (int i=0;i(createException(e.toString(),"NLP Error"), HttpStatus.INTERNAL_SERVER_ERROR); } catch (SimilarityException | LexicalSemanticResourceException e) { return new ResponseEntity<>(createException(e.toString(),"Similarity Error"), HttpStatus.INTERNAL_SERVER_ERROR); + } catch (ExecutionException e) { + e.printStackTrace(); } return new ResponseEntity<>(onjN, HttpStatus.OK); } diff --git a/src/main/java/com/gessi/dependency_detection/domain/KeywordTool.java b/src/main/java/com/gessi/dependency_detection/domain/KeywordTool.java new file mode 100644 index 0000000..341083e --- /dev/null +++ b/src/main/java/com/gessi/dependency_detection/domain/KeywordTool.java @@ -0,0 +1,8 @@ +package com.gessi.dependency_detection.domain; + +public enum KeywordTool { + + RULE_BASED, + TFIDF_BASED + +} diff --git a/src/main/java/com/gessi/dependency_detection/domain/Requirement.java b/src/main/java/com/gessi/dependency_detection/domain/Requirement.java new file mode 100644 index 0000000..68d1f8f --- /dev/null +++ 
b/src/main/java/com/gessi/dependency_detection/domain/Requirement.java @@ -0,0 +1,27 @@ +package com.gessi.dependency_detection.domain; + +public class Requirement { + String description; + String id; + + public Requirement(String s, String s1) { + description=s1; + id=s; + } + + public String getDescription() { + return description; + } + + public void setDescription(String description) { + this.description = description; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } +} diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java b/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java index f64e6fc..8165a7d 100644 --- a/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java +++ b/src/main/java/com/gessi/dependency_detection/functionalities/NLPAnalyser.java @@ -1,22 +1,22 @@ package com.gessi.dependency_detection.functionalities; -import java.io.IOException; -import java.io.InputStream; import java.io.BufferedReader; +import java.io.IOException; import java.io.FileReader; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; +import java.io.InputStream; +import java.util.*; +import java.util.concurrent.ExecutionException; +import com.gessi.dependency_detection.components.Node; import com.gessi.dependency_detection.util.Control; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; +import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpLemmatizer; +import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpParser; +import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpPosTagger; import dkpro.similarity.algorithms.api.SimilarityException; -import org.apache.uima.UIMAException; -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.fit.testing.factory.TokenBuilder; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.lexsemresource.LexicalSemanticResource; import de.tudarmstadt.ukp.dkpro.lexsemresource.core.ResourceFactory; @@ -24,27 +24,23 @@ import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException; import dkpro.similarity.algorithms.lsr.LexSemResourceComparator; import dkpro.similarity.algorithms.lsr.path.WuPalmerComparator; - -import org.springframework.core.io.ClassPathResource; - -import com.gessi.dependency_detection.components.Node; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; - -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.*; - -import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpLemmatizer; -import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpParser; -import de.tudarmstadt.ukp.dkpro.core.clearnlp.ClearNlpPosTagger; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import 
opennlp.tools.tokenize.Tokenizer; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.fit.testing.factory.TokenBuilder; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.springframework.core.io.ClassPathResource; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; + public class NLPAnalyser { @@ -67,6 +63,7 @@ public class NLPAnalyser { public NLPAnalyser() { super(); try { + System.out.println("Loading"); wordnet = ResourceFactory.getInstance().get("wordnet", "en"); wordnet.setIsCaseSensitive(false); } catch (ResourceLoaderException e) { @@ -75,15 +72,40 @@ public NLPAnalyser() { } } - /** - * The approach of dependency detection - * @param requirement + * Semantic similarity engine (DKPRO & WordNet) + * @param term1 + * @param term2 * @return - * @throws IOException - * @throws ResourceInitializationException - * @throws UIMAException + * @throws SimilarityException + * @throws LexicalSemanticResourceException */ + public double semanticSimilarity(String term1, String term2) + throws SimilarityException, LexicalSemanticResourceException { + + if (comparatorWN == null) + comparatorWN = new WuPalmerComparator(wordnet, wordnet.getRoot()); + return comparatorWN.getSimilarity(term1, term2); + + } + + public Map prepareRequirements(Map requirements) throws InterruptedException, ExecutionException, IOException { + List recs=new ArrayList<>(); + for (String s:requirements.keySet()) { + recs.add(new com.gessi.dependency_detection.domain.Requirement(s,requirements.get(s))); + } + Map keywords; + if (requirements.keySet().size()>100) { + TFIDFKeywordExtractor extractor=new TFIDFKeywordExtractor(); + keywords=extractor.computeTFIDF(recs); + } + else { + RAKEKeywordExtractor extractor=new RAKEKeywordExtractor(); + keywords=extractor.computeRake(recs); + } + return keywords; + } + public List requirementAnalysis(String requirement) throws IOException, ResourceInitializationException, UIMAException { @@ -126,6 +148,31 @@ public List requirementAnalysis(String requirement) return synResult; } + /** + * Debug + * Utility to read a file + * @param path + * @return + */ + public List readFile(String path) { + ArrayList fileLines = new ArrayList<>(); + + try(FileReader fr = new FileReader(path); + BufferedReader br = new BufferedReader(fr)) { + + String sCurrentLine; + + while ((sCurrentLine = br.readLine()) != null) { + fileLines.add(sCurrentLine); + } + + } catch (IOException e) { + Control.getInstance().showErrorMessage(e.getMessage()); + } + return fileLines; + } +//--------------------------------------------------------------RULE BASED + /** * Noisy text cleaning * Rule-based method @@ -159,7 +206,7 @@ private String clearSentence(String sentence) { // split the phrase correctly sentence = sentence.replaceAll("\\.(\\s)", " . 
"); sentence = sentence.replaceAll("\\s+", " "); - + // Check the endpoint of the sentence if (sentence.length() > 1) { if (sentence.substring(sentence.length() - 1).equals(";") @@ -173,7 +220,7 @@ private String clearSentence(String sentence) { } return sentence; } - + /** * Tokenization (OpenNLP) * @param requirmenet @@ -245,45 +292,6 @@ public static JCas runParser(AnalysisEngine aEngine, String aLanguage, String aT return jcas; } - - /** - * Lemmatization engine (clearNLP) - * @param term - * @return - * @throws UIMAException - */ - public String lemmatization(String term) throws UIMAException { - if (lemmaEngine == null && tagger == null && lemma == null) { - - tagger = createEngineDescription(ClearNlpPosTagger.class); - lemma = createEngineDescription(ClearNlpLemmatizer.class); - - lemmaEngine = createEngine(createEngineDescription(tagger, lemma)); - } - JCas jcas = runParser(lemmaEngine, "en", term); - Collection lemmas = JCasUtil.select(jcas, Lemma.class); - String ret = ""; - String[] terms = term.split(" "); - int i = 0; - if (!lemmas.isEmpty()) { - for (Lemma l : lemmas) { - if (!l.getValue().matches("\\d+")) { - if (!ret.equals("")) - ret = ret.concat(" " + l.getValue()); - else - ret = l.getValue(); - } else { - if (!ret.equals("")) - ret = ret.concat(" " + terms[i]); - else - ret = terms[i]; - } - i++; - } - } - return ret; - } - /** * Dependency parser engine (clearNLP) * This function generates a dependency tree from the dependency parser results. @@ -314,7 +322,6 @@ public Node dependencyParser(String aText) throws ResourceInitializationExceptio } return root; } - /** * Update the tree information * @param tree @@ -336,7 +343,6 @@ private Node fillTreeLinks(ArrayList tree) { } return root; } - /** * Find the parent of the node from the dependncy parser results * @param tree @@ -360,43 +366,47 @@ private int findParent(ArrayList tree, int parentId, int idx, boolean next } /** - * Semantic similarity engine (DKPRO & WordNet) - * @param term1 - * @param term2 + * Lemmatization engine (clearNLP) + * @param term * @return - * @throws SimilarityException - * @throws LexicalSemanticResourceException + * @throws UIMAException */ - public double semanticSimilarity(String term1, String term2) - throws SimilarityException, LexicalSemanticResourceException { + public String lemmatization(String term) throws UIMAException { + if (lemmaEngine == null && tagger == null && lemma == null) { - if (comparatorWN == null) - comparatorWN = new WuPalmerComparator(wordnet, wordnet.getRoot()); - return comparatorWN.getSimilarity(term1, term2); + tagger = createEngineDescription(ClearNlpPosTagger.class); + lemma = createEngineDescription(ClearNlpLemmatizer.class); + lemmaEngine = createEngine(createEngineDescription(tagger, lemma)); + } + JCas jcas = runParser(lemmaEngine, "en", term); + Collection lemmas = JCasUtil.select(jcas, Lemma.class); + String ret = ""; + String[] terms = term.split(" "); + int i = 0; + if (!lemmas.isEmpty()) { + for (Lemma l : lemmas) { + if (!l.getValue().matches("\\d+")) { + if (!ret.equals("")) + ret = ret.concat(" " + l.getValue()); + else + ret = l.getValue(); + } else { + if (!ret.equals("")) + ret = ret.concat(" " + terms[i]); + else + ret = terms[i]; + } + i++; + } + } + return ret; } - /** - * Debug - * Utility to read a file - * @param path - * @return - */ - public List readFile(String path) { - ArrayList fileLines = new ArrayList<>(); - try(FileReader fr = new FileReader(path); - BufferedReader br = new BufferedReader(fr)) { - String sCurrentLine; - while 
((sCurrentLine = br.readLine()) != null) { - fileLines.add(sCurrentLine); - } - } catch (IOException e) { - Control.getInstance().showErrorMessage(e.getMessage()); - } - return fileLines; - } + + } \ No newline at end of file diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java b/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java index 411bf4b..cfd0f39 100644 --- a/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java +++ b/src/main/java/com/gessi/dependency_detection/functionalities/OntologyHandler.java @@ -3,12 +3,14 @@ import java.io.IOException; import java.util.*; +import com.gessi.dependency_detection.WordEmbedding; +import com.gessi.dependency_detection.components.Node; +import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.S; import org.apache.log4j.varia.NullAppender; import org.apache.uima.UIMAException; import com.gessi.dependency_detection.components.Dependency; import com.gessi.dependency_detection.components.DependencyType; -import com.gessi.dependency_detection.components.Node; import com.gessi.dependency_detection.components.Status; import com.hp.hpl.jena.ontology.DatatypeProperty; import com.hp.hpl.jena.ontology.Individual; @@ -57,14 +59,15 @@ public void loadOnt(String source, String path) throws IOException { /** * Analyse ontology classes and extract its infromation (terms, lemmas) * - * @param analizer * @throws IOException * @throws UIMAException + * @return */ - public void searchClasses(NLPAnalyser analizer) throws IOException, UIMAException { + public int searchClassesTfIdfBased() throws IOException, UIMAException { ontClasses = new ArrayList<>(); classesWords = new ArrayList<>(); classesLemmas = new ArrayList<>(); + int max=1; ExtendedIterator rootClasses = this.model.listClasses(); while (rootClasses.hasNext()) { OntClass thisClass = (OntClass) rootClasses.next(); @@ -82,10 +85,10 @@ public void searchClasses(NLPAnalyser analizer) throws IOException, UIMAExceptio ontTerm = words[i]; } } - String[] lemmas = extractLemmas(ontTerm, analizer); - + String[] lemmas = extractLemmas(ontTerm); ontClasses.add(thisClass); classesWords.add(words); + if (words.length>max) max=words.length; classesLemmas.add(lemmas); for (int i = 0; i < lemmas.length; i++) { synonyms.put(lemmas[i], new ArrayList<>()); @@ -93,204 +96,151 @@ public void searchClasses(NLPAnalyser analizer) throws IOException, UIMAExceptio } } } + return max; } /** - * Check the similarity between two terms - * - * @param reqTerm - * @param ontLemma + * Analyse ontology classes and extract its infromation (terms, lemmas) + * * @param analizer - * @param thr - * @return - * @throws SimilarityException - * @throws LexicalSemanticResourceException + * @throws IOException + * @throws UIMAException */ - private boolean isSynonym(String reqTerm, String ontLemma, NLPAnalyser analizer, double thr) - throws SimilarityException, LexicalSemanticResourceException { - if (!ontLemma.matches("\\d+|\\W+")) { - if (!synonyms.get(ontLemma).contains(reqTerm) && !noSynonyms.get(ontLemma).contains(reqTerm)) { - if (analizer.semanticSimilarity(reqTerm, ontLemma) >= thr) { - synonyms.get(ontLemma).add(reqTerm); + public void searchClasses(NLPAnalyser analizer) throws IOException, UIMAException { + ontClasses = new ArrayList<>(); + classesWords = new ArrayList<>(); + classesLemmas = new ArrayList<>(); + ExtendedIterator rootClasses = this.model.listClasses(); + while (rootClasses.hasNext()) { + OntClass thisClass = (OntClass) 
rootClasses.next(); - return true; - } else { - noSynonyms.get(ontLemma).add(reqTerm); + if (thisClass.getLocalName() != null) { + String ontTerm = ""; + String[] words = thisClass.getLocalName() + .split("_|\\s|(?()); + noSynonyms.put(lemmas[i], new ArrayList<>()); } - } else if (synonyms.get(ontLemma).contains(reqTerm)) { - return true; - } else if (noSynonyms.get(ontLemma).contains(reqTerm)) { - return false; } } - return false; } - /** - * Check if the req. term match with the term of the ontology - * - * @param term - * @param lemma - * @param ontWord - * @param ontLemma - * @return - * @throws SimilarityException - * @throws LexicalSemanticResourceException - */ - private boolean isSameTerm(String term, String lemma, String ontWord, String ontLemma) - throws SimilarityException, LexicalSemanticResourceException { - if (term.equalsIgnoreCase(ontWord)) - return true; - if (lemma.equals(ontWord)) - return true; - if (lemma.equals(ontLemma)) - return true; - if (term.equalsIgnoreCase(ontLemma)) - return true; + private String[] extractLemmas(String ontTerm) throws IOException { + TextPreprocessing textPreprocessing=new TextPreprocessing(); + String l=textPreprocessing.text_preprocess(ontTerm); + RAKEKeywordExtractor rake=new RAKEKeywordExtractor(); + List resAnalysis=rake.RAKEanalyzeNoStopword(l); + String[] res=new String[resAnalysis.size()]; + return resAnalysis.toArray(res); + } + private String[] extractLemmas(String words, NLPAnalyser analizer) throws IOException, UIMAException { + String ontLemma = analizer.lemmatization(words); + return ontLemma.split(" "); + } + - if (term.toLowerCase().matches(ontWord + "s|es")) - return true; - if (lemma.matches(ontWord + "s|es")) - return true; - if (lemma.matches(ontLemma + "s|es")) - return true; - if (term.toLowerCase().matches(ontLemma + "s|es")) - return true; - return false; - } /** - * check if a ordered set of words is the same of the set of words of the - * ontology + * Find if the set of words contains a correct n-gram that match with the + * ontology. 
* - * @param ngramTerm - * @param ngramLemma - * @param words + * @param node * @param lemmas - * @param analizer * @param syny - * @param thr * @return * @throws SimilarityException * @throws LexicalSemanticResourceException */ - private boolean isSameNgram(Stack ngramTerm, Stack ngramLemma, String[] words, String[] lemmas, - NLPAnalyser analizer, boolean syny, double thr) - throws SimilarityException, LexicalSemanticResourceException { - boolean find = false; - ArrayList idxOntLemmaAnalized = new ArrayList<>(); - ArrayList idxReqLemmaAnalized = new ArrayList<>(); - for (int i = 0; i < ngramTerm.size(); i++) { - if (!find && i > 0) { - return false; - } - find = false; - int j = 0; - while (j < words.length && !find) { - if (!idxOntLemmaAnalized.contains(j) - && isSameTerm(ngramTerm.get(i), ngramLemma.get(i), words[j], lemmas[j])) { - find = true; - idxReqLemmaAnalized.add(i); - idxOntLemmaAnalized.add(j); - } - j++; - } + private boolean extractNGram(String node, String[] lemmas, boolean syny,double thr,WordEmbedding wordEmbedding) throws SimilarityException, LexicalSemanticResourceException, IOException { + String[] lemmasNode = node.split(" "); + Set nodeSet=new HashSet(Arrays.asList(lemmasNode)); + Set lemmaSet=new HashSet(Arrays.asList(lemmas)); + if (syny) { + return isSynonym(nodeSet,lemmaSet,thr,wordEmbedding); } - - // of it is not detected, check the synonymy - if (!find && syny) { - - for (int i = 0; i < ngramLemma.size(); i++) { - if (!idxReqLemmaAnalized.contains(i)) { - if (!find && i > 0) { - return false; - } - find = false; - int j = 0; - while (j < lemmas.length && !find) { - if (!idxOntLemmaAnalized.contains(j) - && isSynonym(ngramLemma.get(i), lemmas[j], analizer, thr)) { - find = true; - idxOntLemmaAnalized.add(j); - } - j++; - } - } else find = true; - } + else { + return nodeSet.containsAll(lemmaSet); } - return find; } - /** - * Find all the combinations of the n-gram to check if the req. concept matches - * with the ont. 
concept - * - * @param idx - * @param level - * @param n - * @param termsNode - * @param lemmasNode - * @param ngramTerm - * @param ngramLemma - * @param words - * @param lemmas - * @param analizer - * @param syny - * @param thr - * @return - * @throws SimilarityException - * @throws LexicalSemanticResourceException - */ - private boolean findPotentialNgram(int idx, int level, int n, String[] termsNode, String[] lemmasNode, - Stack ngramTerm, Stack ngramLemma, String[] words, String[] lemmas, NLPAnalyser analizer, - boolean syny, double thr) throws SimilarityException, LexicalSemanticResourceException { - boolean find = false; - for (int j = idx; j < termsNode.length && !find; j++) { - ngramTerm.push(termsNode[j]); - ngramLemma.push(lemmasNode[j]); - if (level < n) { - find = findPotentialNgram(j + 1, level + 1, n, termsNode, lemmasNode, ngramTerm, ngramLemma, words, - lemmas, analizer, syny, thr); + private boolean isSynonym(Set requirementLemmas, Set ontologyLemmas,double thr,WordEmbedding wordEmbedding) throws IOException { + boolean isSynonym=true; + for (String s: ontologyLemmas) { + boolean synonymExists=false; + for (String l:requirementLemmas) { + if (wordEmbedding.computeSimilarity(s,l)>=thr) { + synonymExists=true; + break; + } } - if (level == n && isSameNgram(ngramTerm, ngramLemma, words, lemmas, analizer, syny, thr)) return true; - ngramTerm.pop(); - ngramLemma.pop(); + isSynonym=isSynonym&&synonymExists; + if (!isSynonym) return false; } - return find; + return true; } /** - * Find if the set of words contains a correct n-gram that match with the - * ontology. + * Analyze the potential term candidates extracted from the requirements (n-gram + * concepts), and store the requirement within the related ontology class if + * they matches with a concept of the ontology. 
* - * @param node - * @param words - * @param lemmas - * @param analizer + * @param keywords + * @param reqId + * @param requirement * @param syny - * @param thr - * @return + * @throws IOException * @throws SimilarityException * @throws LexicalSemanticResourceException */ - private boolean extractNGram(Node node, String[] words, String[] lemmas, NLPAnalyser analizer, boolean syny, - double thr) throws SimilarityException, LexicalSemanticResourceException { - String[] termsNode = node.getTerm().split(" "); - String[] lemmasNode = node.getLemma().split(" "); - int n = words.length; - Stack ngramTerm = new Stack<>(); - Stack ngramLemma = new Stack<>(); + public void matching(String keywords, String reqId, String requirement, boolean syny,double thr,WordEmbedding wordEmbedding) throws IOException, SimilarityException, LexicalSemanticResourceException { + ArrayList classes = new ArrayList<>(); + String[] lemmas; + for (int j = 0; j < ontClasses.size(); j++) { + lemmas = classesLemmas.get(j); + if (keywords.split(" ").length >= lemmas.length && extractNGram(keywords, lemmas, syny,thr,wordEmbedding)) { + System.out.println("Requirement " + reqId + " contains class " + String.join(" ", lemmas)); + //System.out.println("REQUIREMENT KEYWORDS: "+keywords); + //System.out.println("ONTOLOGY NAME: "+lemmas.toString()); + + classes.add(ontClasses.get(j)); + } + } - return findPotentialNgram(0, 1, n, termsNode, lemmasNode, ngramTerm, ngramLemma, words, lemmas, analizer, syny, thr); + // Requirement instantiation within the ontology + for (OntClass cls : classes) { + //System.out.println("A MATCH WAS MADE"); + Individual individual = this.model.createIndividual(this.source + ":" + reqId + "_" + cls.getLocalName(), + cls); + DatatypeProperty req = this.model.getDatatypeProperty(this.source + "#requirement"); + individual.setPropertyValue(req, this.model.createTypedLiteral(requirement)); + DatatypeProperty id = this.model.getDatatypeProperty(this.source + "#id"); + individual.setPropertyValue(id, this.model.createTypedLiteral(reqId)); + DatatypeProperty className = this.model.getDatatypeProperty(this.source + "#class"); + individual.setPropertyValue(className, this.model.createTypedLiteral(cls.getLocalName())); + } } - /** * Analyze the potential term candidates extracted from the requirements (n-gram * concepts), and store the requirement within the related ontology class if * they matches with a concept of the ontology. 
- * + * * @param topNodes * @param reqId * @param requirement @@ -301,7 +251,8 @@ private boolean extractNGram(Node node, String[] words, String[] lemmas, NLPAnal * @throws SimilarityException * @throws LexicalSemanticResourceException */ - public void matching(List topNodes, String reqId, String requirement, NLPAnalyser analizer, boolean syny, + + public void matchingRuleBased(List topNodes, String reqId, String requirement, NLPAnalyser analizer, boolean syny, double thr) throws IOException, SimilarityException, LexicalSemanticResourceException { ArrayList classes = new ArrayList<>(); String[] words; @@ -310,7 +261,7 @@ public void matching(List topNodes, String reqId, String requirement, NLPA for (int j = 0; j < ontClasses.size(); j++) { words = classesWords.get(j); lemmas = classesLemmas.get(j); - if (topNodes.get(i).getTerm().split(" ").length >= words.length && extractNGram(topNodes.get(i), words, lemmas, analizer, syny, thr)) classes.add(ontClasses.get(j)); + if (topNodes.get(i).getTerm().split(" ").length >= words.length && extractNGramRuleBased(topNodes.get(i), words, lemmas, analizer, syny, thr)) classes.add(ontClasses.get(j)); } } @@ -327,21 +278,9 @@ public void matching(List topNodes, String reqId, String requirement, NLPA } } - /** - * Extract lemmas from ontology classes - * @param words - * @param analizer - * @return - * @throws IOException - * @throws UIMAException - */ - private String[] extractLemmas(String words, NLPAnalyser analizer) throws IOException, UIMAException { - String ontLemma = analizer.lemmatization(words); - return ontLemma.split(" "); - } /** - * Analyze the ontology and extract dependncies + * Analyze the ontology and extract dependencies * @return */ public List ontConflictDetection() { @@ -368,7 +307,7 @@ public List ontConflictDetection() { if (!f.equals(t)) { Dependency newDep = new Dependency(f, t, Status.PROPOSED, DependencyType.valueOf((String) dep.get(0).toString().toUpperCase())); - dependencies.add(newDep); + if (!dependencies.contains(newDep)) dependencies.add(newDep); } } } @@ -427,5 +366,181 @@ private List displayRestriction(OntProperty property, Resource constrain result.add(constraint); return result; } + private boolean extractNGramRuleBased(Node node, String[] words, String[] lemmas, NLPAnalyser analizer, boolean syny, + double thr) throws SimilarityException, LexicalSemanticResourceException { + String[] termsNode = node.getTerm().split(" "); + String[] lemmasNode = node.getLemma().split(" "); + int n = words.length; + Stack ngramTerm = new Stack<>(); + Stack ngramLemma = new Stack<>(); + + return findPotentialNgram(0, 1, n, termsNode, lemmasNode, ngramTerm, ngramLemma, words, lemmas, analizer, syny, thr); + } + /** + * Find all the combinations of the n-gram to check if the req. concept matches + * with the ont. 
concept + * + * @param idx + * @param level + * @param n + * @param termsNode + * @param lemmasNode + * @param ngramTerm + * @param ngramLemma + * @param words + * @param lemmas + * @param analizer + * @param syny + * @param thr + * @return + * @throws SimilarityException + * @throws LexicalSemanticResourceException + */ + private boolean findPotentialNgram(int idx, int level, int n, String[] termsNode, String[] lemmasNode, + Stack ngramTerm, Stack ngramLemma, String[] words, String[] lemmas, NLPAnalyser analizer, + boolean syny, double thr) throws SimilarityException, LexicalSemanticResourceException { + boolean find = false; + for (int j = idx; j < termsNode.length && !find; j++) { + ngramTerm.push(termsNode[j]); + ngramLemma.push(lemmasNode[j]); + if (level < n) { + find = findPotentialNgram(j + 1, level + 1, n, termsNode, lemmasNode, ngramTerm, ngramLemma, words, + lemmas, analizer, syny, thr); + } + if (level == n && isSameNgram(ngramTerm, ngramLemma, words, lemmas, analizer, syny, thr)) return true; + ngramTerm.pop(); + ngramLemma.pop(); + } + return find; + } + + /** + * check if a ordered set of words is the same of the set of words of the + * ontology + * + * @param ngramTerm + * @param ngramLemma + * @param words + * @param lemmas + * @param analizer + * @param syny + * @param thr + * @return + * @throws SimilarityException + * @throws LexicalSemanticResourceException + */ + private boolean isSameNgram(Stack ngramTerm, Stack ngramLemma, String[] words, String[] lemmas, + NLPAnalyser analizer, boolean syny, double thr) + throws SimilarityException, LexicalSemanticResourceException { + boolean find = false; + ArrayList idxOntLemmaAnalized = new ArrayList<>(); + ArrayList idxReqLemmaAnalized = new ArrayList<>(); + for (int i = 0; i < ngramTerm.size(); i++) { + if (!find && i > 0) { + return false; + } + find = false; + int j = 0; + while (j < words.length && !find) { + if (!idxOntLemmaAnalized.contains(j) + && isSameTerm(ngramTerm.get(i), ngramLemma.get(i), words[j], lemmas[j])) { + find = true; + idxReqLemmaAnalized.add(i); + idxOntLemmaAnalized.add(j); + } + j++; + } + } + + // of it is not detected, check the synonymy + if (!find && syny) { + + for (int i = 0; i < ngramLemma.size(); i++) { + if (!idxReqLemmaAnalized.contains(i)) { + if (!find && i > 0) { + return false; + } + find = false; + int j = 0; + while (j < lemmas.length && !find) { + if (!idxOntLemmaAnalized.contains(j) + && isSynonymRuleBased(ngramLemma.get(i), lemmas[j], analizer, thr)) { + find = true; + idxOntLemmaAnalized.add(j); + } + j++; + } + } else find = true; + } + } + return find; + } + + /** + * Check if the req. 
term match with the term of the ontology + * + * @param term + * @param lemma + * @param ontWord + * @param ontLemma + * @return + * @throws SimilarityException + * @throws LexicalSemanticResourceException + */ + private boolean isSameTerm(String term, String lemma, String ontWord, String ontLemma) + throws SimilarityException, LexicalSemanticResourceException { + + if (term.equalsIgnoreCase(ontWord)) + return true; + if (lemma.equals(ontWord)) + return true; + if (lemma.equals(ontLemma)) + return true; + if (term.equalsIgnoreCase(ontLemma)) + return true; + + if (term.toLowerCase().matches(ontWord + "s|es")) + return true; + if (lemma.matches(ontWord + "s|es")) + return true; + if (lemma.matches(ontLemma + "s|es")) + return true; + if (term.toLowerCase().matches(ontLemma + "s|es")) + return true; + + return false; + } + + + /** + * Check the similarity between two terms + * + * @param reqTerm + * @param ontLemma + * @param analizer + * @param thr + * @return + * @throws SimilarityException + * @throws LexicalSemanticResourceException + */ + private boolean isSynonymRuleBased(String reqTerm, String ontLemma, NLPAnalyser analizer, double thr) + throws SimilarityException, LexicalSemanticResourceException { + if (!ontLemma.matches("\\d+|\\W+")) { + if (!synonyms.get(ontLemma).contains(reqTerm) && !noSynonyms.get(ontLemma).contains(reqTerm)) { + if (analizer.semanticSimilarity(reqTerm, ontLemma) >= thr) { + synonyms.get(ontLemma).add(reqTerm); + + return true; + } else { + noSynonyms.get(ontLemma).add(reqTerm); + } + } else if (synonyms.get(ontLemma).contains(reqTerm)) { + return true; + } else if (noSynonyms.get(ontLemma).contains(reqTerm)) { + return false; + } + } + return false; + } } diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java b/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java new file mode 100644 index 0000000..62540ea --- /dev/null +++ b/src/main/java/com/gessi/dependency_detection/functionalities/RAKEKeywordExtractor.java @@ -0,0 +1,119 @@ +package com.gessi.dependency_detection.functionalities; + +import com.gessi.dependency_detection.domain.Requirement; +import com.linguistic.rake.Rake; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.custom.CustomAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +import javax.swing.plaf.basic.BasicInternalFrameTitlePane; +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class RAKEKeywordExtractor { + private Double cutoff = 3.0; + private TextPreprocessing preprocess = new TextPreprocessing(); + + /** + * Passes the text through Lucene's token analyzer + * @param text Text to clean + * @param analyzer Analyzer to use + * @return Returns a cleaned list of strings + */ + public static List getAnalyzedStrings(String text, Analyzer analyzer) throws IOException { + List result=new ArrayList<>(); + TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text)); + CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class); + tokenStream.reset(); + while (tokenStream.incrementToken()) { + result.add(attr.toString()); + } + return result; + } + + + /** + * Extracts keywords using RAKE algorithm from a given corpus + * @param corpus Corpus to be used for RAKE + * @return Returns a list of maps, compromised 
of + */ + public List> extractKeywords(List corpus) throws IOException { + List> res = new ArrayList<>(); + Rake rake = new Rake(); + for (Requirement s : corpus) { + String text = ""; + for (String k : RAKEanalyzeNoStopword(s.getDescription())) { + text = text + " " + k; + } + Map aux = rake.getKeywordsFromText(text); + String sum = ""; + for (String j : aux.keySet()) { + Double val = aux.get(j); + if (val >= cutoff) sum = sum + " " + j; + } + List result = RAKEanalyze(sum); + Map helper = new HashMap<>(); + for (String i : result) { + helper.put(i, aux.get(i)); + } + res.add(helper); + } + return res; + } + + /** + * Extracts skills using RAKE algorithm + * @param corpus Requirement corpus to be analyzed + * @return Returns a map of maps, compromised by > + */ + public Map computeRake(List corpus) throws IOException { + List> res = extractKeywords(corpus); + Map processedRequirements=new HashMap<>(); + int counter=0; + for (Requirement r: corpus) { + String newText=""; + for (String s:res.get(counter).keySet()) { + newText=newText+" "+s; + } + processedRequirements.put(r.getId(),newText); + ++counter; + } + return processedRequirements; + } + + /** + * Cleans text + * @param text Text to clean + * @return Returns a cleaned list of strings + */ + List RAKEanalyze(String text) throws IOException { + text = preprocess.text_preprocess(text); + Analyzer analyzer = CustomAnalyzer.builder() + .withTokenizer("standard") + .addTokenFilter("lowercase") + .addTokenFilter("stop") + .addTokenFilter("kstem") + .build(); + return getAnalyzedStrings(text, analyzer); + } + + /** + * Cleans text for RAKE algorithm to use + * @param text Text to clean + * @return Returns a cleaned list of strings + */ + public List RAKEanalyzeNoStopword(String text) throws IOException { + Analyzer analyzer = CustomAnalyzer.builder() + .withTokenizer("standard") + .addTokenFilter("lowercase") + .addTokenFilter("kstem") + .build(); + return getAnalyzedStrings(text, analyzer); + } + +} diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/TFIDFKeywordExtractor.java b/src/main/java/com/gessi/dependency_detection/functionalities/TFIDFKeywordExtractor.java new file mode 100644 index 0000000..64e0f24 --- /dev/null +++ b/src/main/java/com/gessi/dependency_detection/functionalities/TFIDFKeywordExtractor.java @@ -0,0 +1,170 @@ +package com.gessi.dependency_detection.functionalities; + +import com.gessi.dependency_detection.domain.Requirement; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.custom.CustomAnalyzer; + +import java.io.IOException; +import java.util.*; +import java.util.concurrent.ExecutionException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +public class TFIDFKeywordExtractor { + + private Double cutoffParameter=4.0; //This can be set to different values for different selectivity (more or less keywords) + private HashMap corpusFrequency = new HashMap<>(); + private TextPreprocessing text_preprocess = new TextPreprocessing(); + + + /** + * Computes the term frequency of each word in the text, and updates the Idf, + * @param doc List of strings to analyze + * @return Returns a map identified by + */ + private Map tf(List doc) { + Map frequency = new HashMap<>(); + for (String s : doc) { + if (frequency.containsKey(s)) frequency.put(s, frequency.get(s) + 1); + else { + frequency.put(s, 1); + if (corpusFrequency.containsKey(s)) corpusFrequency.put(s, corpusFrequency.get(s) + 1); + else corpusFrequency.put(s, 1); + } + + } + return frequency; + } 
+ + private double idf(Integer size, Integer frequency) { + return StrictMath.log(size.doubleValue() / frequency.doubleValue() + 1.0); + } + + /** + * Preprocesses the text + * @param text Text to preprocess + * @param analyzer Analyzer to use + * @return Returns a list of cleaned strings + */ + private List analyze(String text, Analyzer analyzer,String reqId) throws IOException { + text = clean_text(text,reqId); + return RAKEKeywordExtractor.getAnalyzedStrings(text, analyzer); + } + + /** + * Preprocesses the text + * @param text Text to preprocess + * @return Returns a list of cleaned strings + */ + private List englishAnalyze(String text,String reqId) throws IOException { + Analyzer analyzer = CustomAnalyzer.builder() + .withTokenizer("standard") + .addTokenFilter("lowercase") + .addTokenFilter("stop") + .addTokenFilter("kstem") + .build(); + return analyze(text, analyzer, reqId); + } + + /** + * Computes Tf-Idf on a corpus of requirements + * @param corpus Corpus to be used for tf-idf + * @return Returns a map of maps, compromised of > + */ + public Map computeTFIDF(List corpus) throws IOException, ExecutionException, InterruptedException { + List> trueDocs = new ArrayList<>(); + for (Requirement r : corpus) { + List s = englishAnalyze(r.getDescription(),r.getId()); + trueDocs.add(s); + } + List> res = tfIdf(trueDocs); + Map processedRequirements=new HashMap<>(); + int counter=0; + for (Requirement r: corpus) { + String newText=""; + for (String s:res.get(counter).keySet()) { + newText=newText+" "+s; + } + processedRequirements.put(r.getId(),newText); + } + return processedRequirements; + + } + /** + * Computes Tf-Idf on a list of lists + * @param docs Corpus to be used for Tf-Idf + * @return Returns a list of maps, compromised by + */ + private List> tfIdf(List> docs) { + List> tfidfComputed = new ArrayList<>(); + List> wordBag = new ArrayList<>(); + for (List doc : docs) { + wordBag.add(tf(doc)); + } + int counter = 0; + for (List doc : docs) { + HashMap aux = new HashMap<>(); + for (String s : new TreeSet<>(doc)) { + Double idf = idf(docs.size(), corpusFrequency.get(s)); + Integer tf = wordBag.get(counter).get(s); + Double tfidf = idf * tf; + if (tfidf >= cutoffParameter && s.length() > 1) { + aux.put(s, tfidf); + } + } + ++counter; + tfidfComputed.add(aux); + } + return tfidfComputed; + + } + + /** + * Preprocesses the text and adds two special rules to help keyword extraction, these are that any word entirely in capital letters is to be made a keyword, + * and that any word between [] is to be made a keyword + * @param text Text to preprocess + * @return Returns a list of cleaned strings + */ + private String clean_text(String text,String reqId) throws IOException { + text = text_preprocess.text_preprocess(text); + String result = ""; + if (text.contains("[")) { + Pattern p = Pattern.compile("\\[(.*?)\\]"); + Matcher m = p.matcher(text); + while (m.find()) { + text = text + " " + m.group().toUpperCase(); + } + } + for (String a : text.split(" ")) { + String helper = ""; + if (a.toUpperCase().equals(a)) { + for (int i = 0; i < 10; ++i) { + helper = helper.concat(" " + a); + } + a = helper; + } + result = result.concat(" " + a); + } + return result; + } + + + public HashMap getCorpusFrequency() { + return corpusFrequency; + } + + public void setCorpusFrequency(HashMap corpusFrequency) { + this.corpusFrequency = corpusFrequency; + } + + public Double getCutoffParameter() { + return cutoffParameter; + } + + public void setCutoffParameter(Double cutoffParameter) { + 
this.cutoffParameter = cutoffParameter; + } + + +} diff --git a/src/main/java/com/gessi/dependency_detection/functionalities/TextPreprocessing.java b/src/main/java/com/gessi/dependency_detection/functionalities/TextPreprocessing.java new file mode 100644 index 0000000..7410fc2 --- /dev/null +++ b/src/main/java/com/gessi/dependency_detection/functionalities/TextPreprocessing.java @@ -0,0 +1,62 @@ +package com.gessi.dependency_detection.functionalities; + +import org.springframework.stereotype.Service; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +@Service +public class TextPreprocessing { + + Set exclusions = null; + + /** + * Preprocesses the text by removing stopwords and characters that hold no semantic meaning, or would worsen the semantic analysis + * @param text The string to preprocess + * @return The preprocessed text + */ + public String text_preprocess(String text) throws IOException { + String trueRes = ""; + if (text != null) { + text = text.replaceAll("(\\{.*?})", " code "); + text = text.replaceAll("[$,;\\\"/:|!?=()><_{}'+%[0-9]]", " "); + text = text.replaceAll("] \\[", "]["); + + if (exclusions == null) { + BufferedReader reader = new BufferedReader(new FileReader("src/main/resources/ExcludedWords.txt")); + String word = null; + exclusions = new HashSet<>(); + + while ((word = reader.readLine()) != null) { + exclusions.add(word); + } + reader.close(); + } + for (String l : text.split(" ")) { + if (!(l.toLowerCase().equals("null") && !l.equals("null") && !l.equals("Null")) && !l.toUpperCase().equals(l)) l = l.toLowerCase(); + if (l != null && !exclusions.contains(l) && l.length() > 1) { + String[] aux=l.split("\\."); + if (!(aux.length>1 && (aux[1]==null|| aux[0]==null || aux[0].equals("")&&aux[1].equals("") || aux[0].equals(" ")|| aux[1].equals(" ")))) { + if (aux.length > 1) { + String repeatingWord = aux[0]; + l = aux[0] + " " + aux[0]; + for (int i = 1; i < aux.length; ++i) { + repeatingWord = repeatingWord + "." + aux[i]; + l = l + "." 
+ aux[i]; + if (i != (aux.length - 1)) l = l + " " + repeatingWord; + } + } + } + else l=l.replace(".",""); + trueRes = trueRes.concat(l + " "); + } + } + } + return trueRes; + + } + +} diff --git a/src/main/java/com/gessi/dependency_detection/service/DependencyService.java b/src/main/java/com/gessi/dependency_detection/service/DependencyService.java index 4ffc1be..5285c71 100644 --- a/src/main/java/com/gessi/dependency_detection/service/DependencyService.java +++ b/src/main/java/com/gessi/dependency_detection/service/DependencyService.java @@ -5,10 +5,15 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.concurrent.ExecutionException; +import com.gessi.dependency_detection.WordEmbedding; +import com.gessi.dependency_detection.domain.KeywordTool; +import de.tudarmstadt.ukp.dkpro.lexsemresource.exception.ResourceLoaderException; import org.apache.uima.UIMAException; import org.apache.uima.resource.ResourceInitializationException; import org.springframework.beans.factory.annotation.Autowired; @@ -47,7 +52,7 @@ public class DependencyService { * @throws IOException */ @Autowired - public DependencyService(StorageProperties properties) throws IOException { + public DependencyService(StorageProperties properties) throws IOException, ResourceLoaderException { this.rootLocation = Paths.get(properties.getRootLocation()); this.ontLocation = Paths.get(properties.getOntLocation()); this.docLocation = Paths.get(properties.getDocLocation()); @@ -181,31 +186,49 @@ public void loadOntology() throws IOException { * @throws IOException * @throws ResourceInitializationException * @throws UIMAException - * @throws dkpro.similarity.algorithms.api.SimilarityException + * @throws dkpro.similarity.algorithms.api.SimilarityException * @throws LexicalSemanticResourceException */ - public ObjectNode conflictDependencyDetection(String projectId, boolean syny, double thr) + public ObjectNode conflictDependencyDetection(String projectId, boolean syny, double thr, KeywordTool keywordTool) throws IOException, ResourceInitializationException, UIMAException, - dkpro.similarity.algorithms.api.SimilarityException, LexicalSemanticResourceException { + dkpro.similarity.algorithms.api.SimilarityException, LexicalSemanticResourceException, ExecutionException, InterruptedException { // analyse the ontology classes - ontHandler.searchClasses(analizer); + if (keywordTool.equals(KeywordTool.TFIDF_BASED)) ontHandler.searchClassesTfIdfBased(); + else ontHandler.searchClasses(analizer); // read the requirements from JSON Map requirements = jsonHandler.readRequirement(json, projectId); // foreach requirement - for (Entry entry : requirements.entrySet()) { - String key = entry.getKey(); - String value = entry.getValue(); - if (key != null && value != null && !value.equals("")) { - // Apply NLP methods (syntactic approach) - List syntxResutls = analizer.requirementAnalysis(value); - - // Matching of extracted terms with the ontology, it is also applied the semantic appraoch - ontHandler.matching(syntxResutls, key, value, analizer, syny, thr); + + List deps = new ArrayList<>(); + + if (keywordTool.equals(KeywordTool.TFIDF_BASED)) { + Map syntxResutls = analizer.prepareRequirements(requirements); + WordEmbedding wordEmbedding = new WordEmbedding();// Declared here so it won't initialize every time + for (Entry entry : requirements.entrySet()) { + 
ontHandler.matching(syntxResutls.get(entry.getKey()), entry.getKey(), entry.getValue(), syny, thr, wordEmbedding); + } + // Extract dependencies from the ontology + deps = ontHandler.ontConflictDetection(); + } + + else if (keywordTool.equals(KeywordTool.RULE_BASED)) { + for (Entry entry : requirements.entrySet()) { + String key = entry.getKey(); + String value = entry.getValue(); + if (key != null && value != null && !value.equals("")) { + // Apply NLP methods (syntactic approach) + List syntxResutls = analizer.requirementAnalysis(value); + + // Matching of extracted terms with the ontology, it is also applied the semantic appraoch + ontHandler.matchingRuleBased(syntxResutls, key, value, analizer, syny, thr); + } } + // Extract dependencies from the ontology + deps = ontHandler.ontConflictDetection(); } - // Extract dependencies from the ontology - List deps = ontHandler.ontConflictDetection(); + + System.out.println(deps.size()); return jsonHandler.storeDependencies(json, deps); } } diff --git a/src/main/resources/ExcludedWords.txt b/src/main/resources/ExcludedWords.txt new file mode 100644 index 0000000..48b8841 --- /dev/null +++ b/src/main/resources/ExcludedWords.txt @@ -0,0 +1,1080 @@ +x +y +your +yours +yourself +yourselves +you +yond +yonder +yon +ye +yet +z +zillion +j +u +umpteen +usually +us +username +uponed +upons +uponing +upon +ups +upping +upped +up +unto +until +unless +unlike +unliker +unlikest +under +underneath +use +used +usedest +r +rath +rather +rathest +rathe +re +relate +related +relatively +regarding +really +res +respecting +respectively +q +quite +que +qua +n +neither +neaths +neath +nethe +nethermost +necessary +necessariest +necessarier +never +nevertheless +nigh +nighest +nigher +nine +noone +nobody +nobodies +nowhere +nowheres +no +noes +nor +nos +no-one +none +not +notwithstanding +nothings +nothing +nathless +natheless +t +ten +tills +till +tilled +tilling +to +towards +toward +towardest +towarder +together +too +thy +thyself +thus +than +that +those +thou +though +thous +thouses +thoroughest +thorougher +thorough +thoroughly +thru +thruer +thruest +thro +through +throughout +throughest +througher +thine +this +thises +they +thee +the +then +thence +thenest +thener +them +themselves +these +therer +there +thereby +therest +thereafter +therein +thereupon +therefore +their +theirs +thing +things +three +two +o +oh +owt +owning +owned +own +owns +others +other +otherwise +otherwisest +otherwiser +of +often +oftener +oftenest +off +offs +offest +one +ought +oughts +our +ours +ourselves +ourself +out +outest +outed +outwith +outs +outside +over +overallest +overaller +overalls +overall +overs +or +orer +orest +on +oneself +onest +ons +onto +a +atween +at +athwart +atop +afore +afterward +afterwards +after +afterest +afterer +ain +an +any +anything +anybody +anyone +anyhow +anywhere +anent +anear +and +andor +another +around +ares +are +aest +aer +against +again +accordingly +abaft +abafter +abaftest +abovest +above +abover +abouter +aboutest +about +aid +amidst +amid +among +amongst +apartest +aparter +apart +appeared +appears +appear +appearing +appropriating +appropriate +appropriatest +appropriates +appropriater +appropriated +already +always +also +along +alongside +although +almost +all +allest +aller +allyou +alls +albeit +awfully +as +aside +asides +aslant +ases +astrider +astride +astridest +astraddlest +astraddler +astraddle +availablest +availabler +available +aughts +aught +vs +v +variousest +variouser +various +via +vis-a-vis +vis-a-viser 
+vis-a-visest +viz +very +veriest +verier +versus +k +g +go +gone +good +got +gotta +gotten +get +gets +getting +b +by +byandby +by-and-by +bist +both +but +buts +be +beyond +because +became +becomes +become +becoming +becomings +becominger +becomingest +behind +behinds +before +beforehand +beforehandest +beforehander +bettered +betters +better +bettering +betwixt +between +beneath +been +below +besides +beside +m +my +myself +mucher +muchest +much +must +musts +musths +musth +main +make +mayest +many +mauger +maugre +me +meanwhiles +meanwhile +mostly +most +moreover +more +might +mights +midst +midsts +h +huh +humph +he +hers +herself +her +hereby +herein +hereafters +hereafter +hereupon +hence +hadst +had +having +haves +have +has +hast +hardly +hae +hath +him +himself +hither +hitherest +hitherer +his +how-do-you-do +however +how +howbeit +howdoyoudo +hoos +hoo +w +woulded +woulding +would +woulds +was +wast +we +wert +were +with +withal +without +within +why +what +whatever +whateverer +whateverest +whatsoeverer +whatsoeverest +whatsoever +whence +whencesoever +whenever +whensoever +when +whenas +whether +wheen +whereto +whereupon +wherever +whereon +whereof +where +whereby +wherewithal +wherewith +whereinto +wherein +whereafter +whereas +wheresoever +wherefrom +which +whichever +whichsoever +whilst +while +whiles +whithersoever +whither +whosoever +whoso +whomever +s +syne +syn +shalling +shalled +shalls +shoulding +should +shoulded +shoulds +she +sayyid +sayid +said +saider +saidest +same +samest +sames +samer +saved +sans +sanses +sanserifs +sanserif +so +soer +soest +sobeit +someone +somebody +somehow +some +somewhere +somewhat +something +sometimest +sometimes +sometimer +sometime +several +severaler +severalest +serious +seriousest +seriouser +senza +send +sent +seem +seems +seemed +seemingest +seeminger +seemings +seven +summat +sups +sup +supping +supped +such +since +sine +sines +sith +six +stop +stopped +p +plaintiff +plenty +plenties +please +pleased +pleases +per +perhaps +particulars +particularly +particular +particularest +particularer +pro +providing +provides +provided +provide +probably +l +layabout +layabouts +latter +latterest +latterer +latterly +latters +lots +lotting +lotted +lot +lest +less +ie +ifs +if +i +info +information +itself +its +it +is +idem +idemer +idemest +immediate +immediately +immediatest +immediater +in +inwards +inwardest +inwarder +inward +inasmuch +into +instead +insofar +indicates +indicated +indicate +indicating +indeed +inc +f +fact +facts +fs +figupon +figupons +figuponing +figuponed +few +fewer +fewest +frae +from +failing +failings +five +furthers +furtherer +furthered +furtherest +further +furthering +furthermore +fourscore +followthrough +for +forwhy +fornenst +formerly +former +formerer +formerest +formers +forbye +forby +fore +forever +forer +fores +four +d +ddays +dday +do +doing +doings +doe +does +doth +downwarder +downwardest +downward +downwards +downs +done +doner +dones +donest +dos +dost +did +differentest +differenter +different +describing +describe +describes +described +despiting +despites +despited +despite +during +c +cum +circa +chez +cer +certain +certainest +certainer +cest +canst +cannot +cant +cants +canting +cantest +canted +co +could +couldst +comeon +comeons +come-ons +come-on +concerning +concerninger +concerningest +consequently +considering +e +eg +eight +either +even +evens +evenser +evensest +evened +evenest +ever +everyone +everything +everybody +everywhere +every +ere +each +et +elsewhere +else +ex 
+excepted +excepts +except +excepting +exes +enough +ins +able +abst +accordance +according +across +act +actually +added +adj +affected +affecting +affects +ah +alone +am +announce +anymore +anyway +anyways +apparently +approximately +aren +arent +arise +ask +asking +auth +away +back +begin +beginning +beginnings +begins +being +believe +biol +brief +briefly +ca +came +can +can't +cause +causes +certainly +com +come +comes +contain +containing +contains +couldnt +date +didn't +doesn't +doesnt +isnt +wont +don't +down +due +ed +edu +effect +eighty +end +ending +especially +et-al +far +ff +fifth +first +fix +followed +following +follows +forth +found +gave +give +given +gives +giving +goes +happens +hasn't +haven't +hed +here +heres +hes +hi +hid +home +hundred +id +i'll +im +importance +important +index +invention +isn't +itd +it'll +i've +just +keep +keeps +kept +kg +km +know +known +knows +largely +last +lately +later +least +let +lets +like +liked +likely +line +little +'ll +look +looking +looks +ltd +made +mainly +makes +may +maybe +mean +means +meantime +merely +mg +million +miss +ml +mr +mrs +mug +na +name +namely +nay +nd +near +nearly +necessarily +need +needs +new +next +ninety +non +nonetheless +normally +noted +now +obtain +obtained +obviously +ok +okay +old +omitted +once +ones +only +ord +owing +page +pages +part +past +placed +plus +poorly +possible +possibly +potentially +pp +predominantly +present +previously +primarily +promptly +proud +put +quickly +qv +ran +rd +readily +recent +recently +ref +refs +regardless +regards +research +resulted +resulting +results +right +run +saw +say +saying +says +sec +section +see +seeing +seeming +seen +self +selves +shall +shed +she'll +shes +shouldn't +show +showed +shown +showns +shows +significant +significantly +similar +similarly +slightly +somethan +soon +sorry +specifically +specified +specify +specifying +still +strongly +sub +substantially +successfully +sufficiently +suggest +sure +take +taken +taking +tell +tends +th +thank +thanks +thanx +that'll +thats +that've +thered +there'll +thereof +therere +theres +thereto +there've +theyd +they'll +theyre +they've +think +thoughh +thousand +throug +til +tip +took +tried +tries +truly +try +trying +ts +twice +un +unfortunately +unlikely +useful +usefully +usefulness +uses +using +value +'ve +vol +vols +want +wants +wasnt +way +wed +welcome +we'll +went +werent +we've +what'll +whats +wheres +whim +who +whod +whoever +whole +who'll +whom +whos +whose +widely +willing +wish +wont +words +world +wouldnt +www +yes +youd +you'll +youre +you've +zero +due +don +out +only +what +e.g +doesn +de +re \ No newline at end of file diff --git a/src/test/java/com/gessi/dependency_detection/AppTest.java b/src/test/java/com/gessi/dependency_detection/AppTest.java index cebf557..a4de7a4 100644 --- a/src/test/java/com/gessi/dependency_detection/AppTest.java +++ b/src/test/java/com/gessi/dependency_detection/AppTest.java @@ -58,7 +58,7 @@ public void Success() throws Exception { "application/json", jsonFile.toString().getBytes()); - this.mockMvc.perform(MockMvcRequestBuilders.fileUpload("/upc/dependency-detection/json/ontology/ABC?synonymy=true&threshold=0.1") + this.mockMvc.perform(MockMvcRequestBuilders.fileUpload("/upc/dependency-detection/json/ontology/ABC?synonymy=false&threshold=0.1&keywordTool=RULE_BASED") .file(ontology) .file(json)) .andExpect(status().isOk());
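For reviewers who want to smoke-test the new GloVe-backed similarity path outside the service, the following minimal sketch (not part of the patch; the class name is hypothetical) drives the WordEmbedding class added above. It assumes the GloVe binary model linked in the README has already been extracted into gloveModel/ at the project root.

```java
import java.io.IOException;

import com.gessi.dependency_detection.WordEmbedding;

// Hypothetical smoke test, not shipped with this patch.
public class WordEmbeddingSmokeTest {

    public static void main(String[] args) throws IOException {
        // The constructor opens the binary GloVe model located under gloveModel/ (see README).
        WordEmbedding embedding = new WordEmbedding();

        // computeSimilarity returns the cosine similarity of the two word vectors,
        // or -1.0 when either word is missing from the model.
        Double similarity = embedding.computeSimilarity("database", "repository");
        System.out.println("similarity(database, repository) = " + similarity);

        // OntologyHandler.isSynonym treats the pair as synonyms when the score
        // reaches the service's threshold parameter (thr).
        double thr = 0.5;
        System.out.println("synonym at thr=" + thr + ": " + (similarity >= thr));
    }
}
```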
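On the TF-IDF side, the cutoff logic added in TFIDFKeywordExtractor keeps a term as a keyword when tf * log(N/df + 1) reaches cutoffParameter (4.0 by default), and that extractor is only chosen when the project has more than 100 requirements. A small self-contained sketch of that scoring rule, with made-up counts, is shown below for reference.

```java
// Standalone illustration (assumed counts) of the scoring rule used by
// TFIDFKeywordExtractor.tfIdf(): a term is kept when tf * log(N/df + 1) >= cutoff.
public class TfIdfCutoffExample {

    static double idf(int corpusSize, int documentFrequency) {
        // Same formula as the patch's idf() helper.
        return StrictMath.log((double) corpusSize / documentFrequency + 1.0);
    }

    public static void main(String[] args) {
        int corpusSize = 120;        // number of requirements in the project
        int documentFrequency = 3;   // requirements that contain the term
        int termFrequency = 2;       // occurrences of the term inside one requirement

        double score = termFrequency * idf(corpusSize, documentFrequency);
        double cutoff = 4.0;         // default cutoffParameter in the patch
        System.out.println("tf-idf = " + score + ", keyword kept: " + (score >= cutoff));
    }
}
```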