-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* started development on the tests for morton ordering * added 2 tests for matchColumnWithType for numerical types and mixed types * added branch for getNonStringBinaryDF in case the user is only morton ordering string columns * added another string column to the test data; added HashDataFrame helper object in testing for checksumming df; created a mortonStr class to test no numerical column * created numeric test for getNonStringBinaryDF * created test for getBinaryDF with str columns only * created test for getBinaryDF with only numeric columns * created test getBinaryDF with mixed data types * added a column length check in Morton * added a test for Morton class instantiation with one column * added test for mortonIndex * removed variable that shouldn't have been in the test * created github workflow
- Loading branch information
1 parent
3af3710
commit 72ccae1
Showing
21 changed files
with
258 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
# CI for the space-filling-curves project: runs the full sbt test suite
# on pushes to main and to long-lived branch prefixes, and on pull
# requests targeting main.
name: Space Filling Curve CI

on:
  push:
    branches:
      - main
      - feature/*
      - dev/*
      - release/*
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      # checkout@v2 is deprecated (its Node 12 runtime was removed from
      # GitHub-hosted runners); v4 is the current supported release and
      # is a drop-in replacement for this usage.
      - uses: actions/checkout@v4
      - name: Run tests
        # -mem 2048 caps the JVM heap so the Spark tests fit on the
        # default 7 GB hosted runner.
        run: sbt -mem 2048 test
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file added
BIN
+32 Bytes
...ces/mixed_binary/.part-00000-9fd8ef85-048d-4d38-b167-7b9ee0f5d076-c000.snappy.parquet.crc
Binary file not shown.
Empty file.
Binary file added
BIN
+2.76 KB
...esources/mixed_binary/part-00000-9fd8ef85-048d-4d38-b167-7b9ee0f5d076-c000.snappy.parquet
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+28 Bytes
...s/numeric_binary/.part-00000-23538a43-13e8-4916-be8b-42a275799d26-c000.snappy.parquet.crc
Binary file not shown.
Empty file.
Binary file added
BIN
+2.22 KB
...ources/numeric_binary/part-00000-23538a43-13e8-4916-be8b-42a275799d26-c000.snappy.parquet
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+28 Bytes
...urces/str_binary/.part-00000-9c5d0d00-b2d6-452b-92a3-4a50f747eecb-c000.snappy.parquet.crc
Binary file not shown.
Empty file.
Binary file added
BIN
+2.23 KB
.../resources/str_binary/part-00000-9c5d0d00-b2d6-452b-92a3-4a50f747eecb-c000.snappy.parquet
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+28 Bytes
...esources/z_index/.part-00000-ab3411a5-be7b-4049-a358-3347a356908b-c000.snappy.parquet.crc
Binary file not shown.
Empty file.
Binary file added
BIN
+2.01 KB
...est/resources/z_index/part-00000-ab3411a5-be7b-4049-a358-3347a356908b-c000.snappy.parquet
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
55 changes: 55 additions & 0 deletions
55
src/test/scala/io/dustinsmith/spacefillingcurves/HashDataFrame.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
/*
 * Copyright 2021 DustinSmith.Io. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"). You may not
 * use this file except in compliance with the License. A copy of the License
 * is located at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * or in the "license" file accompanying this file. This file is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */
package io.dustinsmith.spacefillingcurves

import scala.util.hashing.MurmurHash3

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, hash}

/**
 * Helper object for comparing dataframes in tests without depending on
 * spark-testing-base / Holdenkarau (which has some problems with Spark 3).
 */
object HashDataFrame extends SparkSessionWrapper {
  import spark.implicits._

  /**
   * Computes a checksum over the entire contents of the supplied DataFrame.
   * Checksum values can be used to confirm that dataframe contents are
   * unchanged after operations that MUST NOT alter actual data
   * (e.g. HDFS leaf file compaction, etc).
   *
   * This method builds hierarchical hashes (row hashes -> RDD partition
   * hashes -> a final DF hash), which makes it relatively inexpensive
   * compared to other ways of comparing dataframes (e.g. joins, minus, etc).
   * It can be used even for very large data/paths.
   * Credit to https://github.com/beljun for this method.
   *
   * @param df       Dataframe to compute the checksum for.
   * @param numParts Level of parallelism. Note that the checksum value changes
   *                 with different numParts values, so it must remain the same
   *                 across comparisons.
   * @return Checksum for the dataframe contents.
   */
  def checksumDataFrame(df: DataFrame, numParts: Int): Int = {
    MurmurHash3.orderedHash(
      df
        // One hash per row, computed over every column.
        .select(hash(df.columns.map(col): _*).as("row_hash"))
        // Fix the partition layout so the per-partition hashes are stable.
        .repartition(numParts, $"row_hash")
        .sortWithinPartitions("row_hash")
        // Fold each partition into a single hash value.
        // Iterator.single replaces the deprecated Array(...).toIterator
        // (toIterator is deprecated since Scala 2.13).
        .mapPartitions(p => Iterator.single(MurmurHash3.orderedHash(p)))
        // Deterministic ordering of the partition hashes before the final fold.
        .orderBy($"value")
        .collect()
    )
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters