diff --git a/pom.xml b/pom.xml index c5e26378..322f3bed 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ The Archives Unleashed Project 2.11.8 2.6.5 - 2.3.1 + 2.4.3 github 2.17 3.0 @@ -42,6 +42,7 @@ 2.18.1 0.7.5.201505241946 2.1 + 1.20 @@ -64,6 +65,10 @@ maven http://repo.maven.apache.org/maven2/ + + jitpack.io + https://jitpack.io + @@ -480,25 +485,98 @@ scala-parser-combinators_2.11 1.0.5 - - com.fasterxml.jackson.module - jackson-module-scala_2.11 - 2.8.8 - org.apache.hadoop hadoop-mapreduce-client-core ${hadoop.version} + + + com.google.protobuf + protobuf-java + + + org.slf4j + slf4j-api + + + javax.xml.bind + jaxb-api + + org.apache.hadoop hadoop-common ${hadoop.version} + + + commons-configuration + commons-configuration + + + com.google.protobuf + protobuf-java + + + commons-codec + commons-codec + + + commons-io + commons-io + + + com.google.code.gson + gson + + + org.slf4j + slf4j-api + + + org.apache.commons + commons-math3 + + org.apache.spark spark-core_2.11 ${spark.version} + + + org.apache.commons + commons-lang3 + + + org.apache.commons + commons-compress + + + com.thoughtworks.paranamer + paranamer + + + javax.servlet + javax.servlet-api + + + org.slf4j + slf4j-api + + + org.slf4j + jul-to-slf4j + + + org.slf4j + jcl-over-slf4j + + + javax.ws.rs + javax.ws.rs-api + + org.apache.spark @@ -510,6 +588,11 @@ spark-graphx_2.11 ${spark.version} + + com.google.guava + guava + 15.0 + org.xerial.snappy snappy-java @@ -529,22 +612,69 @@ org.apache.hadoop hadoop-core + + joda-time + joda-time + + + commons-codec + commons-codec + + + commons-io + commons-io + edu.stanford.nlp stanford-corenlp 3.8.0 + + + org.apache.commons + commons-lang3 + + + org.slf4j + slf4j-api + + org.apache.tika tika-core - 1.19.1 + ${tika.version} org.apache.tika tika-parsers - 1.19.1 + ${tika.version} + + + com.fasterxml.jackson.core + jackson-databind + + + javax.xml.bind + jaxb-api + + + javax.ws.rs + javax.ws.rs-api + + + + + org.apache.tika + tika-langdetect + ${tika.version} + + + com.optimaize.languagedetector + language-detector + + org.rogach @@ -566,82 +696,19 @@ lintools-datatypes 1.0.0 - + - joda-time - joda-time - 2.2 + com.github.netarchivesuite + language-detector + language-detector-0.6a + + org.apache.commons commons-compress 1.18 - - net.java.dev.jets3t - jets3t - 0.6.1 - - - org.codehaus.jackson - jackson-mapper-asl - 1.5.2 - - - commons-net - commons-net - 1.4.1 - - - commons-logging - commons-logging - 1.1.3 - - - org.slf4j - jcl-over-slf4j - 1.7.24 - - - org.slf4j - jul-to-slf4j - 1.7.24 - - - org.slf4j - slf4j-api - 1.7.24 - - - javax.servlet - javax.servlet-api - 3.0.1 - - - org.scala-lang.modules - scala-xml_2.11 - 1.0.6 - - - com.fasterxml.jackson.core - jackson-databind - 2.8.11 - - - org.json - json - 20140107 - - - com.google.code.gson - gson - 2.3.1 - - - com.thoughtworks.paranamer - paranamer - 2.8 - jline jline @@ -650,7 +717,17 @@ org.apache.commons commons-math3 - 3.2 + 3.6.1 + + + commons-logging + commons-logging + 1.2 + + + org.apache.httpcomponents + httpcore + 4.4.10 diff --git a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala index 6fcc3607..ac501a56 100644 --- a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala +++ b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala @@ -178,9 +178,13 @@ class CommandLineApp(conf: CmdAppConf) { def save(d: Dataset[Row]): Unit = { if (!configuration.partition.isEmpty) { - d.coalesce(configuration.partition()).write.csv(saveTarget) + d.coalesce(configuration.partition()).write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .csv(saveTarget) } else { - d.write.csv(saveTarget) + d.write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .csv(saveTarget) } } diff --git a/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala b/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala index c51d4a37..5f0386a4 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala @@ -16,7 +16,9 @@ */ package io.archivesunleashed.matchbox -import org.apache.tika.language.LanguageIdentifier +import org.apache.tika.langdetect.OptimaizeLangDetector; +import org.apache.tika.language.detect.LanguageDetector; +import org.apache.tika.language.detect.LanguageResult; /** Detects language using Apache Tika. */ object DetectLanguage { @@ -30,7 +32,9 @@ object DetectLanguage { if (input.isEmpty) { "" } else { - new LanguageIdentifier(input).getLanguage + val detector: LanguageDetector = new OptimaizeLangDetector().loadModels() + val result : LanguageResult = detector.detect(input) + result.getLanguage() } } } diff --git a/src/test/scala/io/archivesunleashed/ArcTest.scala b/src/test/scala/io/archivesunleashed/ArcTest.scala index 8824715e..a8ec41b9 100644 --- a/src/test/scala/io/archivesunleashed/ArcTest.scala +++ b/src/test/scala/io/archivesunleashed/ArcTest.scala @@ -92,7 +92,7 @@ class ArcTest extends FunSuite with BeforeAndAfter { .collect languageCounts.foreach { - case ("en", count) => assert(57L == count) + case ("en", count) => assert(135L == count) case ("et", count) => assert(6L == count) case ("it", count) => assert(1L == count) case ("lt", count) => assert(61L == count) diff --git a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala index 19f7567a..be4d03b2 100644 --- a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala @@ -102,8 +102,8 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { val base2 = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val langs: Set[String] = Set("en", "fr") - val r = Array("http://www.archive.org/index.php", - "http://www.archive.org/details/DrinkingWithBob-MadonnaAdoptsAfricanBaby887") + val r = Array("http://www.archive.org/", + "http://www.archive.org/index.php") val r2 = base2.keepLanguages(langs) .map(r => r.getUrl).take(2) assert (r2.sameElements(r))