Skip to content

Commit

Permalink
fix: expand hash length to avoid collision (#17)
Browse files Browse the repository at this point in the history
  • Loading branch information
eakmanrq authored May 22, 2024
1 parent d1bf8a1 commit fa14af5
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 18 deletions.
2 changes: 1 addition & 1 deletion sqlframe/base/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ def _create_hash_from_expression(self, expression: exp.Expression) -> str:
from sqlframe.base.session import _BaseSession

value = expression.sql(dialect=_BaseSession().input_dialect).encode("utf-8")
- hash = f"t{zlib.crc32(value)}"[:6]
+ hash = f"t{zlib.crc32(value)}"[:9]
return self.session._normalize_string(hash)

def _get_select_expressions(
Expand Down
20 changes: 10 additions & 10 deletions tests/unit/standalone/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

def test_hash_select_expression(standalone_employee: StandaloneDataFrame):
expression = exp.select("cola").from_("table")
- assert standalone_employee._create_hash_from_expression(expression) == "t17051"
+ assert standalone_employee._create_hash_from_expression(expression) == "t17051938"


def test_columns(standalone_employee: StandaloneDataFrame):
Expand All @@ -20,29 +20,29 @@ def test_columns(standalone_employee: StandaloneDataFrame):
def test_cache(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable):
df = standalone_employee.select("fname").cache()
expected_statements = [
- "DROP VIEW IF EXISTS t31563",
- "CACHE LAZY TABLE t31563 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
- "SELECT `t31563`.`fname` AS `fname` FROM `t31563` AS `t31563`",
+ "DROP VIEW IF EXISTS t31563989",
+ "CACHE LAZY TABLE t31563989 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
+ "SELECT `t31563989`.`fname` AS `fname` FROM `t31563989` AS `t31563989`",
]
compare_sql(df, expected_statements)


def test_persist_default(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable):
df = standalone_employee.select("fname").persist()
expected_statements = [
- "DROP VIEW IF EXISTS t31563",
- "CACHE LAZY TABLE t31563 OPTIONS('storageLevel' = 'MEMORY_AND_DISK_SER') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
- "SELECT `t31563`.`fname` AS `fname` FROM `t31563` AS `t31563`",
+ "DROP VIEW IF EXISTS t31563989",
+ "CACHE LAZY TABLE t31563989 OPTIONS('storageLevel' = 'MEMORY_AND_DISK_SER') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
+ "SELECT `t31563989`.`fname` AS `fname` FROM `t31563989` AS `t31563989`",
]
compare_sql(df, expected_statements)


def test_persist_storagelevel(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable):
df = standalone_employee.select("fname").persist("DISK_ONLY_2")
expected_statements = [
- "DROP VIEW IF EXISTS t31563",
- "CACHE LAZY TABLE t31563 OPTIONS('storageLevel' = 'DISK_ONLY_2') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
- "SELECT `t31563`.`fname` AS `fname` FROM `t31563` AS `t31563`",
+ "DROP VIEW IF EXISTS t31563989",
+ "CACHE LAZY TABLE t31563989 OPTIONS('storageLevel' = 'DISK_ONLY_2') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
+ "SELECT `t31563989`.`fname` AS `fname` FROM `t31563989` AS `t31563989`",
]
compare_sql(df, expected_statements)

Expand Down
12 changes: 6 additions & 6 deletions tests/unit/standalone/test_dataframe_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ def test_insertInto_byName(standalone_employee: StandaloneDataFrame, compare_sql
def test_insertInto_cache(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable):
df = standalone_employee.cache().write.insertInto("table_name")
expected_statements = [
- "DROP VIEW IF EXISTS t12441",
- "CACHE LAZY TABLE t12441 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT `a1`.`employee_id` AS `employee_id`, CAST(`a1`.`fname` AS STRING) AS `fname`, CAST(`a1`.`lname` AS STRING) AS `lname`, `a1`.`age` AS `age`, `a1`.`store_id` AS `store_id` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
- "INSERT INTO table_name SELECT `t12441`.`employee_id` AS `employee_id`, `t12441`.`fname` AS `fname`, `t12441`.`lname` AS `lname`, `t12441`.`age` AS `age`, `t12441`.`store_id` AS `store_id` FROM `t12441` AS `t12441`",
+ "DROP VIEW IF EXISTS t12441709",
+ "CACHE LAZY TABLE t12441709 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT `a1`.`employee_id` AS `employee_id`, CAST(`a1`.`fname` AS STRING) AS `fname`, CAST(`a1`.`lname` AS STRING) AS `lname`, `a1`.`age` AS `age`, `a1`.`store_id` AS `store_id` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
+ "INSERT INTO table_name SELECT `t12441709`.`employee_id` AS `employee_id`, `t12441709`.`fname` AS `fname`, `t12441709`.`lname` AS `lname`, `t12441709`.`age` AS `age`, `t12441709`.`store_id` AS `store_id` FROM `t12441709` AS `t12441709`",
]
compare_sql(df, expected_statements)

Expand Down
Expand Up @@ -94,9 +94,9 @@ def test_mode_override(standalone_employee: StandaloneDataFrame, compare_sql: t.
def test_saveAsTable_cache(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable):
df = standalone_employee.cache().write.saveAsTable("table_name")
expected_statements = [
- "DROP VIEW IF EXISTS t12441",
- "CACHE LAZY TABLE t12441 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT `a1`.`employee_id` AS `employee_id`, CAST(`a1`.`fname` AS STRING) AS `fname`, CAST(`a1`.`lname` AS STRING) AS `lname`, `a1`.`age` AS `age`, `a1`.`store_id` AS `store_id` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
- "CREATE TABLE table_name AS SELECT `t12441`.`employee_id` AS `employee_id`, `t12441`.`fname` AS `fname`, `t12441`.`lname` AS `lname`, `t12441`.`age` AS `age`, `t12441`.`store_id` AS `store_id` FROM `t12441` AS `t12441`",
+ "DROP VIEW IF EXISTS t12441709",
+ "CACHE LAZY TABLE t12441709 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT `a1`.`employee_id` AS `employee_id`, CAST(`a1`.`fname` AS STRING) AS `fname`, CAST(`a1`.`lname` AS STRING) AS `lname`, `a1`.`age` AS `age`, `a1`.`store_id` AS `store_id` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
+ "CREATE TABLE table_name AS SELECT `t12441709`.`employee_id` AS `employee_id`, `t12441709`.`fname` AS `fname`, `t12441709`.`lname` AS `lname`, `t12441709`.`age` AS `age`, `t12441709`.`store_id` AS `store_id` FROM `t12441709` AS `t12441709`",
]
compare_sql(df, expected_statements)

Expand Down
2 changes: 1 addition & 1 deletion tests/unit/standalone/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def test_sql_with_aggs(standalone_session: StandaloneSession, compare_sql: t.Cal
df = standalone_session.sql(query).groupBy(F.col("cola")).agg(F.sum("colb"))
compare_sql(
df,
- "WITH t26614 AS (SELECT `table`.`cola` AS `cola`, `table`.`colb` AS `colb` FROM `table` AS `table`), t23454 AS (SELECT cola, colb FROM t26614) SELECT cola, SUM(colb) FROM t23454 GROUP BY cola",
+ "WITH t26614157 AS (SELECT `table`.`cola` AS `cola`, `table`.`colb` AS `colb` FROM `table` AS `table`), t38889420 AS (SELECT cola, colb FROM t26614157) SELECT cola, SUM(colb) FROM t38889420 GROUP BY cola",
pretty=False,
optimize=False,
)
Expand Down

0 comments on commit fa14af5

Please sign in to comment.