diff --git a/pom.xml b/pom.xml
index c5e26378..322f3bed 100644
--- a/pom.xml
+++ b/pom.xml
@@ -22,7 +22,7 @@
The Archives Unleashed Project
2.11.8
2.6.5
- 2.3.1
+ 2.4.3
github
2.17
3.0
@@ -42,6 +42,7 @@
2.18.1
0.7.5.201505241946
2.1
+ 1.20
@@ -64,6 +65,10 @@
maven
http://repo.maven.apache.org/maven2/
+
+ jitpack.io
+ https://jitpack.io
+
@@ -480,25 +485,98 @@
scala-parser-combinators_2.11
1.0.5
-
- com.fasterxml.jackson.module
- jackson-module-scala_2.11
- 2.8.8
-
org.apache.hadoop
hadoop-mapreduce-client-core
${hadoop.version}
+
+
+ com.google.protobuf
+ protobuf-java
+
+
+ org.slf4j
+ slf4j-api
+
+
+ javax.xml.bind
+ jaxb-api
+
+
org.apache.hadoop
hadoop-common
${hadoop.version}
+
+
+ commons-configuration
+ commons-configuration
+
+
+ com.google.protobuf
+ protobuf-java
+
+
+ commons-codec
+ commons-codec
+
+
+ commons-io
+ commons-io
+
+
+ com.google.code.gson
+ gson
+
+
+ org.slf4j
+ slf4j-api
+
+
+ org.apache.commons
+ commons-math3
+
+
org.apache.spark
spark-core_2.11
${spark.version}
+
+
+ org.apache.commons
+ commons-lang3
+
+
+ org.apache.commons
+ commons-compress
+
+
+ com.thoughtworks.paranamer
+ paranamer
+
+
+ javax.servlet
+ javax.servlet-api
+
+
+ org.slf4j
+ slf4j-api
+
+
+ org.slf4j
+ jul-to-slf4j
+
+
+ org.slf4j
+ jcl-over-slf4j
+
+
+ javax.ws.rs
+ javax.ws.rs-api
+
+
org.apache.spark
@@ -510,6 +588,11 @@
spark-graphx_2.11
${spark.version}
+
+ com.google.guava
+ guava
+ 15.0
+
org.xerial.snappy
snappy-java
@@ -529,22 +612,69 @@
org.apache.hadoop
hadoop-core
+
+ joda-time
+ joda-time
+
+
+ commons-codec
+ commons-codec
+
+
+ commons-io
+ commons-io
+
edu.stanford.nlp
stanford-corenlp
3.8.0
+
+
+ org.apache.commons
+ commons-lang3
+
+
+ org.slf4j
+ slf4j-api
+
+
org.apache.tika
tika-core
- 1.19.1
+ ${tika.version}
org.apache.tika
tika-parsers
- 1.19.1
+ ${tika.version}
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+
+
+ javax.xml.bind
+ jaxb-api
+
+
+ javax.ws.rs
+ javax.ws.rs-api
+
+
+
+
+ org.apache.tika
+ tika-langdetect
+ ${tika.version}
+
+
+ com.optimaize.languagedetector
+ language-detector
+
+
org.rogach
@@ -566,82 +696,19 @@
lintools-datatypes
1.0.0
-
+
- joda-time
- joda-time
- 2.2
+ com.github.netarchivesuite
+ language-detector
+ language-detector-0.6a
+
+
org.apache.commons
commons-compress
1.18
-
- net.java.dev.jets3t
- jets3t
- 0.6.1
-
-
- org.codehaus.jackson
- jackson-mapper-asl
- 1.5.2
-
-
- commons-net
- commons-net
- 1.4.1
-
-
- commons-logging
- commons-logging
- 1.1.3
-
-
- org.slf4j
- jcl-over-slf4j
- 1.7.24
-
-
- org.slf4j
- jul-to-slf4j
- 1.7.24
-
-
- org.slf4j
- slf4j-api
- 1.7.24
-
-
- javax.servlet
- javax.servlet-api
- 3.0.1
-
-
- org.scala-lang.modules
- scala-xml_2.11
- 1.0.6
-
-
- com.fasterxml.jackson.core
- jackson-databind
- 2.8.11
-
-
- org.json
- json
- 20140107
-
-
- com.google.code.gson
- gson
- 2.3.1
-
-
- com.thoughtworks.paranamer
- paranamer
- 2.8
-
jline
jline
@@ -650,7 +717,17 @@
org.apache.commons
commons-math3
- 3.2
+ 3.6.1
+
+
+ commons-logging
+ commons-logging
+ 1.2
+
+
+ org.apache.httpcomponents
+ httpcore
+ 4.4.10
diff --git a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
index 6fcc3607..ac501a56 100644
--- a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
+++ b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
@@ -178,9 +178,13 @@ class CommandLineApp(conf: CmdAppConf) {
def save(d: Dataset[Row]): Unit = {
if (!configuration.partition.isEmpty) {
- d.coalesce(configuration.partition()).write.csv(saveTarget)
+ d.coalesce(configuration.partition()).write
+ .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+ .csv(saveTarget)
} else {
- d.write.csv(saveTarget)
+ d.write
+ .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+ .csv(saveTarget)
}
}
diff --git a/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala b/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala
index c51d4a37..5f0386a4 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala
@@ -16,7 +16,9 @@
*/
package io.archivesunleashed.matchbox
-import org.apache.tika.language.LanguageIdentifier
+import org.apache.tika.langdetect.OptimaizeLangDetector;
+import org.apache.tika.language.detect.LanguageDetector;
+import org.apache.tika.language.detect.LanguageResult;
/** Detects language using Apache Tika. */
object DetectLanguage {
@@ -30,7 +32,9 @@ object DetectLanguage {
if (input.isEmpty) {
""
} else {
- new LanguageIdentifier(input).getLanguage
+ val detector: LanguageDetector = new OptimaizeLangDetector().loadModels()
+ val result : LanguageResult = detector.detect(input)
+ result.getLanguage()
}
}
}
diff --git a/src/test/scala/io/archivesunleashed/ArcTest.scala b/src/test/scala/io/archivesunleashed/ArcTest.scala
index 8824715e..a8ec41b9 100644
--- a/src/test/scala/io/archivesunleashed/ArcTest.scala
+++ b/src/test/scala/io/archivesunleashed/ArcTest.scala
@@ -92,7 +92,7 @@ class ArcTest extends FunSuite with BeforeAndAfter {
.collect
languageCounts.foreach {
- case ("en", count) => assert(57L == count)
+ case ("en", count) => assert(135L == count)
case ("et", count) => assert(6L == count)
case ("it", count) => assert(1L == count)
case ("lt", count) => assert(61L == count)
diff --git a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala
index 19f7567a..be4d03b2 100644
--- a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala
+++ b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala
@@ -102,8 +102,8 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
val base2 = RecordLoader.loadArchives(arcPath, sc)
.keepValidPages()
val langs: Set[String] = Set("en", "fr")
- val r = Array("http://www.archive.org/index.php",
- "http://www.archive.org/details/DrinkingWithBob-MadonnaAdoptsAfricanBaby887")
+ val r = Array("http://www.archive.org/",
+ "http://www.archive.org/index.php")
val r2 = base2.keepLanguages(langs)
.map(r => r.getUrl).take(2)
assert (r2.sameElements(r))