You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
-- Counts how many (column_A, column_B) combinations occur in more than one row.
-- The final result is a single row holding one integer count.
-- NOTE(review): <table_name> is a placeholder; substitute a real table identifier before running.
WITH frequencies AS (
-- One output row per (column_A, column_B) group; frequency is the number of rows in that group.
SELECT COUNT(*) AS frequency
FROM <table_name>
-- Rows with a NULL in either column are dropped before grouping, so they never count as duplicates.
WHERE column_A IS NOT NULL AND column_B IS NOT NULL
GROUP BY column_A, column_B)
-- Number of groups that contain at least two rows.
SELECT COUNT(*)
FROM frequencies
WHERE frequency > 1
| Caused by: java.lang.OutOfMemoryError: Java heap space
[2024-10-08, 01:06:16 UTC] {log.py:112} ERROR - | Stacktrace:
[2024-10-08, 01:06:16 UTC] {log.py:113} ERROR - | Traceback (most recent call last):
| File "/home//_venv/lib/python3.11/site-packages/soda/scan.py", line 516, in execute
| data_source_scan.execute_queries()
| File "/home//_venv/lib/python3.11/site-packages/soda/execution/data_source_scan.py", line 72, in execute_queries
| query.execute()
| File "/home//_venv/lib/python3.11/site-packages/soda/execution/query/duplicates_query.py", line 73, in execute
| self.fetchone()
| File "/home//_venv/lib/python3.11/site-packages/soda/execution/query/query.py", line 166, in fetchone
| self.row = cursor.fetchone()
| ^^^^^^^^^^^^^^^^^
| File "/home//_venv/lib/python3.11/site-packages/soda/data_sources/spark_df_cursor.py", line 40, in fetchone
| spark_rows: list[Row] = self.df.collect()
| ^^^^^^^^^^^^^^^^^
| File "/home//_venv/lib/python3.11/site-packages/pyspark/sql/dataframe.py", line 1263, in collect
| sock_info = self._jdf.collectToPython()
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home//_venv/lib/python3.11/site-packages/py4j/java_gateway.py", line 1322, in call
| return_value = get_return_value(
| ^^^^^^^^^^^^^^^^^
| File "/home//_venv/lib/python3.11/site-packages/pyspark/errors/exceptions/captured.py", line 179, in deco
| return f(a, kw)
| ^^^^^^^^^^^
| File "/home//***_venv/lib/python3.11/site-packages/py4j/protocol.py", line 326, in get_return_value
| raise Py4JJavaError(
| py4j.protocol.Py4JJavaError: An error occurred while calling o256.collectToPython.
The text was updated successfully, but these errors were encountered:
DataFrame size: 2.33 GB
Check: duplicate_count(column_A, column_B) = 0
| Caused by: java.lang.OutOfMemoryError: Java heap space
[2024-10-08, 01:06:16 UTC] {log.py:112} ERROR - | Stacktrace:
[2024-10-08, 01:06:16 UTC] {log.py:113} ERROR - | Traceback (most recent call last):
| File "/home//_venv/lib/python3.11/site-packages/soda/scan.py", line 516, in execute
| data_source_scan.execute_queries()
| File "/home//_venv/lib/python3.11/site-packages/soda/execution/data_source_scan.py", line 72, in execute_queries
| query.execute()
| File "/home//_venv/lib/python3.11/site-packages/soda/execution/query/duplicates_query.py", line 73, in execute
| self.fetchone()
| File "/home//_venv/lib/python3.11/site-packages/soda/execution/query/query.py", line 166, in fetchone
| self.row = cursor.fetchone()
| ^^^^^^^^^^^^^^^^^
| File "/home//_venv/lib/python3.11/site-packages/soda/data_sources/spark_df_cursor.py", line 40, in fetchone
| spark_rows: list[Row] = self.df.collect()
| ^^^^^^^^^^^^^^^^^
| File "/home//_venv/lib/python3.11/site-packages/pyspark/sql/dataframe.py", line 1263, in collect
| sock_info = self._jdf.collectToPython()
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home//_venv/lib/python3.11/site-packages/py4j/java_gateway.py", line 1322, in call
| return_value = get_return_value(
| ^^^^^^^^^^^^^^^^^
| File "/home//_venv/lib/python3.11/site-packages/pyspark/errors/exceptions/captured.py", line 179, in deco
| return f(a, kw)
| ^^^^^^^^^^^
| File "/home//***_venv/lib/python3.11/site-packages/py4j/protocol.py", line 326, in get_return_value
| raise Py4JJavaError(
| py4j.protocol.Py4JJavaError: An error occurred while calling o256.collectToPython.
The text was updated successfully, but these errors were encountered: