Minor changes in helps

HY-UDBMS · Jul 19, 2019 · 177fb81 · 177fb81
1 parent 8c8bad3
commit 177fb81
Show file tree

Hide file tree

Showing 5 changed files with 109 additions and 39 deletions.
diff --git a/README.md b/README.md
@@ -12,47 +12,114 @@ If you want to develop AU-Join, you should have at least JDK 8 installed, an IDE
 
 ## Usage
 
-To get help, run `./AU-Join --help`.
+This program consists of two parts: `AU-Join` for similarity join and `AU-Esti` for estimating the best overlap constraint.
+
+To get help, run `./AU-Join --help` or `./AU-Esti --help`.
+
+##### AU-Join Usage
 
 ```
 usage: [-h] [--jaccard JACCARD] [--taxonomy TAXONOMY] [--synonym SYNONYM]
-       [-j THREAD] [-c COMMON] [-o OUTPUT] [THRESHOLD] [LIST_1] [LIST_2]
+       [-c COMMON] [--filter-fast] [--verify-greedy] [--single] [-o OUTPUT]
+       [THRESHOLD] [LIST_1] [LIST_2]
 
 optional arguments:
-  -h, --help            show this help message and exit
+  -h, --help                    show this help message and exit
 
-  --jaccard JACCARD     gram length for Jaccard similarity (> 1)
+  --jaccard JACCARD             enable Jaccard similarity and set gram length
+                                (> 1)
 
-  --taxonomy TAXONOMY   filename of taxonomy knowledge
+  --taxonomy TAXONOMY           enable taxonomy similarity and specify the
+                                filename of taxonomy knowledge
 
-  --synonym SYNONYM     filename of synonym knowledge
+  --synonym SYNONYM             enable synonym similarity and specify the
+                                filename of synonym knowledge
 
-  -j THREAD,            number of threads for filtering and verification
-  --thread THREAD       (default: number of cores minus 2)
+  -c COMMON, --common COMMON    number of common signatures (default: 1)
 
-  -c COMMON,            number of common signatures (default: 1)
-  --common COMMON
+  --filter-fast, --filter-dp    specify the filtering method: Fast (Heuristic)
+                                and DP (Dynamic Programming) (default:
+                                --filter-fast)
 
-  -o OUTPUT,            name of a file for writing join results (default: to
-  --output OUTPUT       stdout)
+  --verify-greedy,              specify the verification method: Greedy,
+  --verify-squareimp,           SquareImp, or our improved SquareImp (default:
+  --verify-squareimp-improved   --verify-greedy)
 
+  --single                      perform filtering and verification on a single
+                                thread (default: on multiple threads)
+
+  -o OUTPUT, --output OUTPUT    method for handling join results: null (no
+                                output), stdout (to standard output), or a
+                                filename (output as csv) (default: -o null)
 
 positional arguments:
-  THRESHOLD             similarity threshold (0, 1]
+  THRESHOLD                     similarity threshold (0, 1]
 
-  LIST_1                filename of the first segmented string list
+  LIST_1                        filename of the first segmented string list
 
-  LIST_2                filename of the second segmented string list
+  LIST_2                        filename of the second segmented string list
 
+example: ./AU-Join --taxonomy tax.txt --synonym syn.txt --jaccard 3 -c3
+-oresult.csv 0.9 list1.txt list2.txt
+```
+
+##### AU-Esti Usage
 
-Example: ./AU-Join --taxonomy tax.txt --synonym syn.txt --jaccard 3 -c3 -oresult.csv 0.9 list1.txt list2.txt
 ```
+usage: [-h] [--jaccard JACCARD] [--taxonomy TAXONOMY] [--synonym SYNONYM]
+       [--filter-fast] [--verify-greedy] [--single] [-s SAMPLE_SIZE]
+       [-q QUANTILE] [-i ITERATION] [THRESHOLD] [LIST_1] [LIST_2] [OVERLAPS]...
+
+optional arguments:
+  -h, --help                    show this help message and exit
+
+  --jaccard JACCARD             enable Jaccard similarity and set gram length
+                                (> 1)
+
+  --taxonomy TAXONOMY           enable taxonomy similarity and specify the
+                                filename of taxonomy knowledge
+
+  --synonym SYNONYM             enable synonym similarity and specify the
+                                filename of synonym knowledge
+
+  --filter-fast, --filter-dp    specify the filtering method: Fast (Heuristic)
+                                and DP (Dynamic Programming) (default:
+                                --filter-fast)
+
+  --verify-greedy,              specify the verification method: Greedy,
+  --verify-squareimp,           SquareImp, or our improved SquareImp (default:
+  --verify-squareimp-improved   --verify-greedy)
+
+  --single                      perform filtering and verification on a single
+                                thread (default: on multiple threads)
+
+  -s SAMPLE_SIZE,               specify the expected sample size for
+  --sample-size SAMPLE_SIZE     estimation (> 0, default: 100)
+
+  -q QUANTILE,                  specify the quantile for Student
+  --quantile QUANTILE           t-distribution (default: 0.842 for 60%
+                                confidence levels on both sides)
+
+  -i ITERATION,                 limit the number of iterations (> 0, default:
+  --iteration ITERATION         20)
+
+positional arguments:
+  THRESHOLD                     similarity threshold (0, 1]
+
+  LIST_1                        filename of the first segmented string list
+
+  LIST_2                        filename of the second segmented string list
+
+  OVERLAPS                      values of overlap to be tested
+
+example: ./AU-Esti --taxonomy tax.txt --synonym syn.txt --jaccard 3 0.9 list1.txt
+list2.txt 1 2 3 4 5
+```
+
+## Feedback
 
-## Comments and feedback
+Pengfei Xu (pengfei.xu[at]helsinki[dot]fi) and Jiaheng Lu (jiahenglu[at]gmail[dot]com)
 
-Pengfei Xu (pengfei.xu@helsinki.fi) and Jiaheng Lu (jiahenglu@gmail.com)
+## License
 
-## Next version will include
-* `SquareImp`-based verification algorithm
-* DP prefix selection
-* Sampling algorithm
+MIT License
diff --git a/src/main/kotlin/fi/helsinki/cs/udbms/MainEstimation.kt b/src/main/kotlin/fi/helsinki/cs/udbms/MainEstimation.kt
@@ -32,6 +32,8 @@ import kotlin.math.pow
 import kotlin.system.measureTimeMillis
 
 fun main(args: Array<String>): Unit = mainBody {
+    println("Arguments: ${args.joinToString(separator = " ")}")
+
     val params = EstimationParameters.initialise(args)
     Dispatcher.initialise(params.singleThread)
 
@@ -66,7 +68,7 @@ fun main(args: Array<String>): Unit = mainBody {
 
     /*=================================================================*/
 
-    print("Running test drive... ")
+    print("Test driving... ")
     var verified = false
     val (filterTime, verifyTime) = params.overlapList.map {
         val result = testDrive(
@@ -96,12 +98,12 @@ fun main(args: Array<String>): Unit = mainBody {
     val p1 = params.sampleSize.toDouble() / list1.size
     val p2 = params.sampleSize.toDouble() / list2.size
 
-    var lastEstimations = emptyMap<Int, Estimation>()
+    var lastEstimations = emptyMap<Int, Estimation>() // Map[overlap, estimation]
 
     var iteration = 0
     while (++iteration <= params.iteration) {
-        val sample1 = getBernoulliSample(list1, p1)
-        val sample2 = getBernoulliSample(list2, p2)
+        val sample1 = list1.getBernoulliSample(p1).toList()
+        val sample2 = list2.getBernoulliSample(p2).toList()
 
         val estimations = params.overlapList.map { overlap ->
             val result = testDrive(params, sample1, sample2, pebbles1, pebbles2, order, syn, tax, overlap, true)
@@ -132,26 +134,25 @@ fun main(args: Array<String>): Unit = mainBody {
     }
 
     print("Overlap parameters from the best to the worst: ")
-    print(lastEstimations.values.sortedBy { it.scaledCost }.map { it.overlap }.joinToString())
+    print(lastEstimations.values.sortedBy { it.meanOfScaledCost }.map { it.overlap }.joinToString())
     println()
 
     Dispatcher.shutdown()
     return@mainBody
 }
 
 private fun shouldStop(iteration: Int, estimations: Map<Int, Estimation>): Boolean {
-    if (estimations.size == 1) return true
-    if (iteration < 5) return false
+    if (estimations.size == 1) return true // if only one candidate, stop anyway
+    if (iteration < 5) return false // discard instability in early stages
 
     val estimationsSorted = estimations.values.sortedBy { it.scaledCost }
 
     return estimationsSorted.first().getMaxScaledCost() < estimationsSorted.drop(1).first().getMinScaledCost()
 }
 
-private fun <T> getBernoulliSample(data: Iterable<T>, p: Double): List<T> {
-    val rand = ThreadLocalRandom.current()
+private fun <T> Iterable<T>.getBernoulliSample(p: Double): Iterable<T> {
     @Suppress("UNCHECKED_CAST")
-    return data.mapParallel { if (rand.nextDouble(1.0) < p) it else Unit }.filterNot { it == Unit } as List<T>
+    return this.mapParallel { if (ThreadLocalRandom.current().nextDouble(1.0) < p) it else Unit }.filterNot { it == Unit } as Iterable<T>
 }
 
 // region Estimation

diff --git a/src/main/kotlin/fi/helsinki/cs/udbms/MainJoin.kt b/src/main/kotlin/fi/helsinki/cs/udbms/MainJoin.kt
@@ -32,6 +32,8 @@ import java.io.File
 import kotlin.system.measureTimeMillis
 
 fun main(args: Array<String>) = mainBody {
+    println("Arguments: ${args.joinToString(separator = " ")}")
+
     val params = JoinParameters.initialise(args)
     Dispatcher.initialise(params.singleThread)
 

diff --git a/src/main/kotlin/fi/helsinki/cs/udbms/util/EstimationParameters.kt b/src/main/kotlin/fi/helsinki/cs/udbms/util/EstimationParameters.kt
@@ -41,7 +41,7 @@ class EstimationParameters(parser: ArgParser) {
                 args,
                 helpFormatter = DefaultHelpFormatter(
                     epilogue = """
-                    Example: ./AU-Est --taxonomy tax.txt --synonym syn.txt --jaccard 3 -oresult.csv 0.9 list1.txt list2.txt 1 2 3 4 5
+                    example: ./AU-Esti --taxonomy tax.txt --synonym syn.txt --jaccard 3 0.9 list1.txt list2.txt 1 2 3 4 5
                 """.trimIndent()
                 )
             ).parseInto(::EstimationParameters)
@@ -75,14 +75,14 @@ class EstimationParameters(parser: ArgParser) {
     val filter by parser.mapping(
         "--filter-fast" to "Fast",
         "--filter-dp" to "DP",
-        help = "Specify the filtering method: Fast (Heuristic) and DP (Dynamic Programming) (default: --filter-fast)"
+        help = "specify the filtering method: Fast (Heuristic) and DP (Dynamic Programming) (default: --filter-fast)"
     ).default { "Fast" }
 
     val verify by parser.mapping(
         "--verify-greedy" to "Greedy",
         "--verify-squareimp" to "SquareImp",
         "--verify-squareimp-improved" to "SquareImp-Improved",
-        help = "Specify the verification method: Greedy, SquareImp, or our improved SquareImp (default: --verify-greedy)"
+        help = "specify the verification method: Greedy, SquareImp, or our improved SquareImp (default: --verify-greedy)"
     ).default { "Greedy" }
 
     val singleThread by parser.flagging(
@@ -132,7 +132,7 @@ class EstimationParameters(parser: ArgParser) {
 
     val overlapList by parser.positionalList(
         "OVERLAPS",
-        "Values of overlap to be tested",
+        "values of overlap to be tested",
         1..Int.MAX_VALUE
     ) { toInt() }.default { emptyList() }.addValidator {
         if (value.isEmpty()) throw InvalidArgumentException("You muse specify at least one overlap value")

diff --git a/src/main/kotlin/fi/helsinki/cs/udbms/util/JoinParameters.kt b/src/main/kotlin/fi/helsinki/cs/udbms/util/JoinParameters.kt
@@ -41,7 +41,7 @@ class JoinParameters(parser: ArgParser) {
                 args,
                 helpFormatter = DefaultHelpFormatter(
                     epilogue = """
-                    Example: ./AU-Join --taxonomy tax.txt --synonym syn.txt --jaccard 3 -c3 -oresult.csv 0.9 list1.txt list2.txt
+                    example: ./AU-Join --taxonomy tax.txt --synonym syn.txt --jaccard 3 -c3 -oresult.csv 0.9 list1.txt list2.txt
                 """.trimIndent()
                 )
             ).parseInto(::JoinParameters)
@@ -82,14 +82,14 @@ class JoinParameters(parser: ArgParser) {
     val filter by parser.mapping(
         "--filter-fast" to "Fast",
         "--filter-dp" to "DP",
-        help = "Specify the filtering method: Fast (Heuristic) and DP (Dynamic Programming) (default: --filter-fast)"
+        help = "specify the filtering method: Fast (Heuristic) and DP (Dynamic Programming) (default: --filter-fast)"
     ).default { "Fast" }
 
     val verify by parser.mapping(
         "--verify-greedy" to "Greedy",
         "--verify-squareimp" to "SquareImp",
         "--verify-squareimp-improved" to "SquareImp-Improved",
-        help = "Specify the verification method: Greedy, SquareImp, or our improved SquareImp (default: --verify-greedy)"
+        help = "specify the verification method: Greedy, SquareImp, or our improved SquareImp (default: --verify-greedy)"
     ).default { "Greedy" }
 
     val singleThread by parser.flagging(