You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I am trying to export an out of memory dataframe to parquet as in the following code but i keep getting the following error.
Code:
'''
import numpy as np
from matplotlib import pyplot as plt
import vaex as vd
def custom_shift(df, column):
# Extract the values of the column
values = vd.from_arrays(column=df[f'{column}'].values)
# Create a new shifted column with None as the first value
d = vd.from_arrays(column=[None])
shifted_values = vd.concat([d, values[:-1]])
shifted_values = vd.from_arrays(ClosePrice_shifted=shifted_values).ClosePrice_shifted.values
df.add_column(f'{column}_shifted', shifted_values)
# Add the shifted values as a new column
return df
Error:
Traceback (most recent call last):
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 113, in evaluate
result = self[expression]
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 198, in getitem
raise KeyError("Unknown variables or column: %r" % (variable,))
KeyError: "Unknown variables or column: '((ClosePrice / ClosePrice_shifted) - 1)'"
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2273, in data_type
data = self.evaluate(expression, 0, 1, filtered=False, array_type=array_type, parallel=False)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 3095, in evaluate
return self._evaluate_implementation(expression, i1=i1, i2=i2, out=out, selection=selection, filtered=filtered, array_type=array_type, parallel=parallel, chunk_size=chunk_size, progress=progress)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 6562, in _evaluate_implementation
value = block_scope.evaluate(expression)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 113, in evaluate
result = self[expression]
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 188, in getitem
values = self.evaluate(expression)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 119, in evaluate
result = eval(expression, expression_namespace, self)
File "", line 1, in
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\arrow\numpy_dispatch.py", line 74, in operator
result_data = a.add_missing(result_data)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\arrow\numpy_dispatch.py", line 27, in add_missing
ar = vaex.array_types.to_arrow(ar)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\array_types.py", line 184, in to_arrow
return pa.array(x)
File "pyarrow\array.pxi", line 340, in pyarrow.lib.array
File "pyarrow\array.pxi", line 86, in pyarrow.lib._ndarray_to_array
File "pyarrow\error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Intijir\PycharmProjects\quantStrategy\Data\Labeling\cumulative_sum.py", line 89, in
ddf.export_parquet('dollar_bars_threshold.parquet', engine='pyarrow')
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 6823, in export_parquet
schema = self.schema_arrow(reduce_large=True)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2335, in schema_arrow
return pa.schema({name: reduce(dtype.arrow) for name, dtype in self.schema().items()})
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2323, in schema
return {column_name:self.data_type(column_name) for column_name in self.get_column_names()}
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2323, in
return {column_name:self.data_type(column_name) for column_name in self.get_column_names()}
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2275, in data_type
data = self.evaluate(expression, 0, 1, filtered=True, array_type=array_type, parallel=False)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 3095, in evaluate
return self._evaluate_implementation(expression, i1=i1, i2=i2, out=out, selection=selection, filtered=filtered, array_type=array_type, parallel=parallel, chunk_size=chunk_size, progress=progress)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 6402, in _evaluate_implementation
max_stop = (len(self) if (self.filtered and filtered) else self.length_unfiltered())
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 4326, in len
self._cached_filtered_length = int(self.count())
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 967, in count
return self._compute_agg('count', expression, binby, limits, shape, selection, delay, edges, progress, array_type=array_type)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 941, in _compute_agg
return self._delay(delay, progressbar.exit_on(var))
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 1780, in _delay
self.execute()
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 421, in execute
self.executor.execute()
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\execution.py", line 308, in execute
for _ in self.execute_generator():
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\execution.py", line 432, in execute_generator
yield from self.thread_pool.map(self.process_part, dataset.chunk_iterator(run.dataset_deps, chunk_size),
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\multithreading.py", line 100, in map
iterator = super(ThreadPoolIndex, self).map(wrapped, cancellable_iter())
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures_base.py", line 598, in map
fs = [self.submit(fn, *args) for args in zip(*iterables)]
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures_base.py", line 598, in
fs = [self.submit(fn, *args) for args in zip(*iterables)]
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\multithreading.py", line 86, in cancellable_iter
for value in chunk_iterator:
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataset.py", line 1257, in chunk_iterator
for (i1, i2, ichunks), (j1, j2, jchunks) in zip(
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\arrow\dataset.py", line 182, in chunk_iterator
chunks = chunks_future.result()
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures_base.py", line 446, in result
return self.__get_result()
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures_base.py", line 391, in __get_result
raise self._exception
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures\thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\arrow\dataset.py", line 114, in reader
table = fragment.to_table(columns=list(columns_physical), use_threads=False)
File "pyarrow_dataset.pyx", line 1613, in pyarrow._dataset.Fragment.to_table
File "pyarrow_dataset.pyx", line 3713, in pyarrow._dataset.Scanner.to_table
File "pyarrow\error.pxi", line 154, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow\error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowMemoryError: malloc of size 8388608 failed
Thanks!
The text was updated successfully, but these errors were encountered:
Intijir
changed the title
Exporting out of memory dataframe to parquet out of memory issue
Exporting out of memory dataframe to parquet error
Apr 20, 2024
I am trying to export an out of memory dataframe to parquet as in the following code but i keep getting the following error.
Code:
'''
import numpy as np
from matplotlib import pyplot as plt
import vaex as vd
def custom_shift(df, column):
# Extract the values of the column
values = vd.from_arrays(column=df[f'{column}'].values)
# Create a new shifted column with None as the first value
d = vd.from_arrays(column=[None])
shifted_values = vd.concat([d, values[:-1]])
shifted_values = vd.from_arrays(ClosePrice_shifted=shifted_values).ClosePrice_shifted.values
df.add_column(f'{column}_shifted', shifted_values)
# Add the shifted values as a new column
return df
def get_threshold(daily_returns, lookback=40):
ewm_std = np.abs(daily_returns.rolling(window=lookback).std())
threshold = np.exp(ewm_std)
return threshold.mean() * 0.1
ddf = custom_shift(ddf, 'ClosePrice')
ddf = ddf.dropna()
ddf['daily_returns'] = ddf['ClosePrice'] / ddf['ClosePrice_shifted'] - 1
ddf['threshold'] = (ddf['daily_returns'].apply(get_threshold))
ddf.export_parquet('dollar_bars_threshold.parquet', engine='pyarrow')
'''
Error:
Traceback (most recent call last):
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 113, in evaluate
result = self[expression]
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 198, in getitem
raise KeyError("Unknown variables or column: %r" % (variable,))
KeyError: "Unknown variables or column: '((ClosePrice / ClosePrice_shifted) - 1)'"
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2273, in data_type
data = self.evaluate(expression, 0, 1, filtered=False, array_type=array_type, parallel=False)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 3095, in evaluate
return self._evaluate_implementation(expression, i1=i1, i2=i2, out=out, selection=selection, filtered=filtered, array_type=array_type, parallel=parallel, chunk_size=chunk_size, progress=progress)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 6562, in _evaluate_implementation
value = block_scope.evaluate(expression)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 113, in evaluate
result = self[expression]
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 188, in getitem
values = self.evaluate(expression)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 119, in evaluate
result = eval(expression, expression_namespace, self)
File "", line 1, in
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\arrow\numpy_dispatch.py", line 74, in operator
result_data = a.add_missing(result_data)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\arrow\numpy_dispatch.py", line 27, in add_missing
ar = vaex.array_types.to_arrow(ar)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\array_types.py", line 184, in to_arrow
return pa.array(x)
File "pyarrow\array.pxi", line 340, in pyarrow.lib.array
File "pyarrow\array.pxi", line 86, in pyarrow.lib._ndarray_to_array
File "pyarrow\error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Intijir\PycharmProjects\quantStrategy\Data\Labeling\cumulative_sum.py", line 89, in
ddf.export_parquet('dollar_bars_threshold.parquet', engine='pyarrow')
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 6823, in export_parquet
schema = self.schema_arrow(reduce_large=True)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2335, in schema_arrow
return pa.schema({name: reduce(dtype.arrow) for name, dtype in self.schema().items()})
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2323, in schema
return {column_name:self.data_type(column_name) for column_name in self.get_column_names()}
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2323, in
return {column_name:self.data_type(column_name) for column_name in self.get_column_names()}
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2275, in data_type
data = self.evaluate(expression, 0, 1, filtered=True, array_type=array_type, parallel=False)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 3095, in evaluate
return self._evaluate_implementation(expression, i1=i1, i2=i2, out=out, selection=selection, filtered=filtered, array_type=array_type, parallel=parallel, chunk_size=chunk_size, progress=progress)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 6402, in _evaluate_implementation
max_stop = (len(self) if (self.filtered and filtered) else self.length_unfiltered())
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 4326, in len
self._cached_filtered_length = int(self.count())
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 967, in count
return self._compute_agg('count', expression, binby, limits, shape, selection, delay, edges, progress, array_type=array_type)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 941, in _compute_agg
return self._delay(delay, progressbar.exit_on(var))
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 1780, in _delay
self.execute()
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 421, in execute
self.executor.execute()
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\execution.py", line 308, in execute
for _ in self.execute_generator():
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\execution.py", line 432, in execute_generator
yield from self.thread_pool.map(self.process_part, dataset.chunk_iterator(run.dataset_deps, chunk_size),
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\multithreading.py", line 100, in map
iterator = super(ThreadPoolIndex, self).map(wrapped, cancellable_iter())
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures_base.py", line 598, in map
fs = [self.submit(fn, *args) for args in zip(*iterables)]
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures_base.py", line 598, in
fs = [self.submit(fn, *args) for args in zip(*iterables)]
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\multithreading.py", line 86, in cancellable_iter
for value in chunk_iterator:
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataset.py", line 1257, in chunk_iterator
for (i1, i2, ichunks), (j1, j2, jchunks) in zip(
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\arrow\dataset.py", line 182, in chunk_iterator
chunks = chunks_future.result()
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures_base.py", line 446, in result
return self.__get_result()
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures_base.py", line 391, in __get_result
raise self._exception
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures\thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\arrow\dataset.py", line 114, in reader
table = fragment.to_table(columns=list(columns_physical), use_threads=False)
File "pyarrow_dataset.pyx", line 1613, in pyarrow._dataset.Fragment.to_table
File "pyarrow_dataset.pyx", line 3713, in pyarrow._dataset.Scanner.to_table
File "pyarrow\error.pxi", line 154, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow\error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowMemoryError: malloc of size 8388608 failed
Thanks!
The text was updated successfully, but these errors were encountered: