-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
beginning of the dawn of nlp (untested)
- Loading branch information
Showing
2 changed files
with
89 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
package ptrman.causalReasoningSystem.nlp; | ||
|
||
import ptrman.causalReasoningSystem.InputGraph; | ||
|
||
import java.util.*; | ||
|
||
/** | ||
* | ||
*/ | ||
public class Builder { | ||
public static class TokenWithIndex { | ||
String token; | ||
int index; | ||
|
||
public TokenWithIndex(String token, int index) { | ||
this.token = token; | ||
this.index = index; | ||
} | ||
} | ||
|
||
public static List<TokenWithIndex> fillGraphWithSentences(InputGraph graph, List<Sentence> sentences) { | ||
List<TokenWithIndex> uniqueTokens = calcUniqueTokens(sentences); | ||
final int lastTokenIndex = uniqueTokens.get(uniqueTokens.size()-1).index; | ||
int iterationSentenceIndex = lastTokenIndex+1; | ||
|
||
for( Sentence currentSentence : sentences ) { | ||
currentSentence.causalRootGraphIndex = iterationSentenceIndex; | ||
|
||
// connect from the words to the root | ||
// we do this in this direction because the sentences bind better to tighter bound words | ||
|
||
for( String iterationToken : currentSentence.tokens ) { | ||
int tokenIndex = getTokenIndex(uniqueTokens, iterationToken); | ||
graph.connections.add(new InputGraph.Connection(tokenIndex, currentSentence.causalRootGraphIndex)); | ||
} | ||
|
||
iterationSentenceIndex++; | ||
} | ||
|
||
|
||
return uniqueTokens; | ||
} | ||
|
||
private static int getTokenIndex(final List<TokenWithIndex> tokenWithIndexes, final String token) { | ||
for( TokenWithIndex iterationToken : tokenWithIndexes ) { | ||
if( token.equals(iterationToken.token) ) { | ||
return iterationToken.index; | ||
} | ||
} | ||
|
||
throw new RuntimeException("Internal Error"); | ||
} | ||
|
||
private static List<TokenWithIndex> calcUniqueTokens(List<Sentence> sentences) { | ||
Set<String> uniqueSet = new HashSet<>(); | ||
|
||
for( Sentence iterationSentence : sentences ) { | ||
for( String iterationToken : iterationSentence.tokens ) { | ||
uniqueSet.add(iterationToken); | ||
} | ||
} | ||
|
||
String[] uniqueStringArray = new String[uniqueSet.size()]; | ||
uniqueStringArray = uniqueSet.toArray(uniqueStringArray); | ||
|
||
List<TokenWithIndex> result = new ArrayList<>(); | ||
int i = 0; | ||
|
||
for( String iterationToken : uniqueStringArray ) { | ||
result.add(new TokenWithIndex(iterationToken, i)); | ||
i++; | ||
} | ||
|
||
return result; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
package ptrman.causalReasoningSystem.nlp; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/**
 * A tokenized sentence.
 *
 * Created by r0b3 on 08.12.2015.
 */
public class Sentence {
    /** Tokens of the sentence; starts out as an empty, mutable list. */
    public List<String> tokens = new ArrayList<>();

    /** Index of the root of this sentence in the graph; 0 until it is assigned. */
    int causalRootGraphIndex;
}