From 8e4319928a4b28a99c505ff4b5e7767e603ae414 Mon Sep 17 00:00:00 2001 From: Gaetano Guerriero Date: Fri, 28 Jan 2022 11:27:37 +0100 Subject: [PATCH 1/2] better support for virtual columns --- pydruid/client.py | 8 +++++++ pydruid/query.py | 9 ++++++++ pydruid/utils/virtualcolumns.py | 28 ++++++++++++++++++++++ tests/test_query.py | 32 ++++++++++++++++++++++++-- tests/utils/test_virtualcolumns.py | 37 ++++++++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 pydruid/utils/virtualcolumns.py create mode 100644 tests/utils/test_virtualcolumns.py diff --git a/pydruid/client.py b/pydruid/client.py index ae51eab7..5757aaac 100755 --- a/pydruid/client.py +++ b/pydruid/client.py @@ -89,6 +89,8 @@ def topn(self, **kwargs): :param str dimension: Dimension to run the query against :param str metric: Metric over which to sort the specified dimension by :param int threshold: How many of the top items to return + :param list virtual_columns: A list of VirtualColumnSpec or dict to add + to the query :return: The query result :rtype: Query @@ -137,6 +139,8 @@ def timeseries(self, **kwargs): :type intervals: str or list :param dict aggregations: A map from aggregator name to one of the ``pydruid.utils.aggregators`` e.g., ``doublesum`` + :param list virtual_columns: A list of VirtualColumnSpec or dict to add + to the query :return: The query result :rtype: Query @@ -240,6 +244,8 @@ def groupby(self, **kwargs): :param dict aggregations: A map from aggregator name to one of the ``pydruid.utils.aggregators`` e.g., ``doublesum`` :param list dimensions: The dimensions to group by + :param list virtual_columns: A list of VirtualColumnSpec or dict to add + to the query :return: The query result :rtype: Query @@ -599,6 +605,8 @@ def scan(self, **kwargs): empty, all columns are returned :param list metrics: The list of metrics to select. If left empty, all metrics are returned + :param list virtual_columns: A list of VirtualColumnSpec or dict to add + to the query :param dict context: A dict of query context options :return: The query result diff --git a/pydruid/query.py b/pydruid/query.py index 09ec3260..3fea03f2 100644 --- a/pydruid/query.py +++ b/pydruid/query.py @@ -23,6 +23,7 @@ from pydruid.utils.having import Having from pydruid.utils.postaggregator import Postaggregator from pydruid.utils.query_utils import UnicodeWriter +from pydruid.utils.virtualcolumns import build_virtual_column class Query(MutableSequence): @@ -310,6 +311,10 @@ def build_query(self, query_type, args): query_dict[key] = build_dimension(val) elif key == "dimensions": query_dict[key] = [build_dimension(v) for v in val] + elif key in ("virtualColumns", "virtual_columns"): + query_dict["virtualColumns"] = [ + build_virtual_column(virtual_col) for virtual_col in val + ] else: query_dict[key] = val @@ -341,6 +346,7 @@ def topn(self, args): "threshold", "metric", "virtualColumns", + "virtual_columns", ] self.validate_query(query_type, valid_parts, args) return self.build_query(query_type, args) @@ -365,6 +371,7 @@ def timeseries(self, args): "post_aggregations", "intervals", "virtualColumns", + "virtual_columns", ] self.validate_query(query_type, valid_parts, args) return self.build_query(query_type, args) @@ -391,6 +398,7 @@ def groupby(self, args): "dimensions", "limit_spec", "virtualColumns", + "virtual_columns", ] self.validate_query(query_type, valid_parts, args) return self.build_query(query_type, args) @@ -516,6 +524,7 @@ def scan(self, args): "intervals", "limit", "order", + "virtual_columns", ] self.validate_query(query_type, valid_parts, args) return self.build_query(query_type, args) diff --git a/pydruid/utils/virtualcolumns.py b/pydruid/utils/virtualcolumns.py new file mode 100644 index 00000000..14465538 --- /dev/null +++ b/pydruid/utils/virtualcolumns.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from typing import Union + + +def build_virtual_column(virtual_column: Union[VirtualColumnSpec, dict]) -> dict: + if isinstance(virtual_column, VirtualColumnSpec): + return virtual_column.build() + + return virtual_column + + +class VirtualColumnSpec: + def __init__(self, name: str, expression: str, output_type: str = None): + self._name = name + self._expression = expression + self._output_type = output_type + + def build(self) -> dict: + build = { + "type": "expression", + "name": self._name, + "expression": self._expression, + } + if self._output_type is not None: + build["outputType"] = self._output_type + + return build diff --git a/tests/test_query.py b/tests/test_query.py index dbeef448..ea889634 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- +# -*- coding: utf-8 -*- # # Copyright 2016 Metamarkets Group Inc. # @@ -23,7 +23,7 @@ from pandas.testing import assert_frame_equal from pydruid.query import Query, QueryBuilder -from pydruid.utils import aggregators, filters, having, postaggregator +from pydruid.utils import aggregators, filters, having, postaggregator, virtualcolumns def create_query_with_results(): @@ -226,6 +226,34 @@ def test_build_subquery(self): # then assert subquery_dict == expected_query_dict + def test_topn_virtual_columns(self) -> None: + builder = QueryBuilder() + + query_kwargs = { + "datasource": "things", + "intervals": "2013-10-23/2013-10-26", + "granularity": "all", + "dimension": "virtual_id", + "threshold": 5, + "metric": "views", + "virtual_columns": [ + virtualcolumns.VirtualColumnSpec( + "virtual_id", "concat('ns:' + real_id)", 'STRING' + ) + ] + } + + query = builder.topn(query_kwargs) + assert query.query_dict['queryType'] == 'topN' + + expected_virtual_columns = [ + { + 'type': 'expression', 'name': 'virtual_id', + 'expression': "concat('ns:' + real_id)", 'outputType': 'STRING' + } + ] + assert query.query_dict['virtualColumns'] == expected_virtual_columns + class TestQuery: def test_export_tsv(self, tmpdir): diff --git a/tests/utils/test_virtualcolumns.py b/tests/utils/test_virtualcolumns.py new file mode 100644 index 00000000..64849b85 --- /dev/null +++ b/tests/utils/test_virtualcolumns.py @@ -0,0 +1,37 @@ +from pydruid.utils.virtualcolumns import build_virtual_column, VirtualColumnSpec + + +class BuildVirtualColumnTC: + def test_with_virtual_column_spec(self) -> None: + virtual_col_spec = VirtualColumnSpec("foo", "bar + 3") + assert build_virtual_column(virtual_col_spec) == virtual_col_spec.build() + + def test_with_dict(self) -> None: + virtual_column = { + "type": "expression", + "name": "doubleVote", + "expression": "vote * 2", + "outputType": "LONG", + } + assert build_virtual_column(virtual_column) == virtual_column + + +class TestVirtualColumnSpec: + def test_default(self) -> None: + spec = VirtualColumnSpec("foo", "concat(bar + '123')") + expected = { + "type": "expression", + "name": "foo", + "expression": "concat(bar + '123')", + } + assert spec.build() == expected + + def test_output_type(self) -> None: + spec = VirtualColumnSpec("foo", "bar * 3", "LONG") + expected = { + "type": "expression", + "name": "foo", + "expression": "bar * 3", + "outputType": "LONG", + } + assert spec.build() == expected From 0eb7f64bce119f37de8501dc23391eb10ba32a87 Mon Sep 17 00:00:00 2001 From: Gaetano Guerriero Date: Fri, 28 Jan 2022 11:50:17 +0100 Subject: [PATCH 2/2] add missing urllib submodules imports --- pydruid/client.py | 3 ++- tests/test_client.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pydruid/client.py b/pydruid/client.py index 5757aaac..9c058321 100755 --- a/pydruid/client.py +++ b/pydruid/client.py @@ -15,7 +15,8 @@ # import json import re -import urllib +import urllib.error +import urllib.request from base64 import b64encode from pydruid.query import QueryBuilder diff --git a/tests/test_client.py b/tests/test_client.py index 440a9204..5894cad2 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -1,6 +1,6 @@ # -*- coding: UTF-8 -*- import textwrap -import urllib +import urllib.error from io import StringIO from unittest.mock import Mock, patch