From e7d53cdb04403890884a95051708b6b781506e01 Mon Sep 17 00:00:00 2001
From: Caglar Demir
Date: Wed, 3 Nov 2021 11:02:29 +0100
Subject: [PATCH] The data conversion is explained in README.md

---
 README.md                   | 56 +++++++++++++++++++++++++++----------
 examples/sklearn_example.py |  2 ++
 vectograph/quantizer.py     | 23 ++++++++++-----
 3 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index b7b4e49..5c6e491 100644
--- a/README.md
+++ b/README.md
@@ -2,10 +2,39 @@
 Vectograph is an open-source software library for automatically creating a graph structured data from a given tabular data.
 
+- [Creating Structured Data from Tabular Data](#creating-structured-data-from-tabular-data)
 - [Installation](#installation)
+- [Examples](#examples)
 
-# Installation
-### Installation from source
+## Creating Structured Data from Tabular Data
+Let **X** be an **m** by **n** matrix representing the input tabular data. The structured data is created by following these steps:
+1. Apply the [QCUT algorithm](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html) to each column that has at least **min_unique_val_per_column** unique values (see the discretization sketch under Examples).
+2. Consider
+   1. **the i-th row** as the i-th [concise bounded description](https://www.w3.org/Submission/CBD/) of **the i-th event**.
+   2. **the j-th column** as the j-th relation/predicate/edge.
+   3. A triple is modeled as event_i -> relation_j -> **X_ij**.
+
+Assume that the first row of [fetch_california_housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) is
+```
+[ 8.3252 41. 6.98412698 1.02380952 322. 2.55555556 37.88 -122.23 ]
+```
+Applying the QCUT algorithm with the default parameters **min_unique_val_per_column=6, num_quantile=5** generates the 0-th CBD
+```
+<Event_0> <relation_0> <0_quantile_4> .
+<Event_0> <relation_1> <1_quantile_4> .
+<Event_0> <relation_2> <2_quantile_4> .
+<Event_0> <relation_3> <3_quantile_1> .
+<Event_0> <relation_4> <4_quantile_0> .
+<Event_0> <relation_5> <5_quantile_1> .
+<Event_0> <relation_6> <6_quantile_4> .
+<Event_0> <relation_7> <7_quantile_0> .
+```
+which consists of **n** triples.
+```<relation_0>``` represents the 0-th relation, i.e., the 0-th column, whereas ```<0_quantile_4>``` represents a tail entity,
+i.e., the 4-th bin of the 0-th column of the tabular data. After the data conversion, we store the bin values of each column.
+For instance, running examples/sklearn_example.py generates ```Feature_Category_0_Mapping.csv```, which indicates that
+```0_quantile_4``` corresponds to a bin covering all values greater than or equal to **5.10972**.
+
+## Installation
 ```
 git clone https://github.com/dice-group/Vectograph.git
 conda create -n temp python=3.6 # Or be sure that your have Python => 3.6.
 conda activate temp
 pip install -e .
 python -c "import vectograph"
 python -m pytest tests
 ```
-### Installation via pip (later)
-```
-pip install vectograph # only a placeholder
-```
-### Scripting Examples
-
-### Using vectograph
-Create a toy dataset via sklearn. Available datasets: boston, iris, diabetes, digits, wine, and breast_cancer.
-```bash
-python create_toy_data.py --toy_dataset_name "boston"
-# Discretize each column having at least 12 unique values into 10 quantiles, otherwise do nothing
-python main.py --tabularpath "boston.csv" --kg_name "boston.nt" --num_quantile=10 --min_unique_val_per_column=12
-```
+## Examples
+### API Example
 ```python
 from vectograph.transformers import GraphGenerator
 from vectograph.quantizer import QCUT
@@ -44,6 +62,14 @@
 for s, p, o in kg:
     print(s, p, o)
 ```
+### Scripting Example
+Create a toy dataset via sklearn. Available datasets: boston, iris, diabetes, digits, wine, and breast_cancer.
+```bash
+python create_toy_data.py --toy_dataset_name "boston"
+# Discretize each column having at least 12 unique values into 10 quantiles, otherwise do nothing
+python main.py --tabularpath "boston.csv" --kg_name "boston.nt" --num_quantile=10 --min_unique_val_per_column=12
+```
+
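+### Discretization Sketch
+A minimal standalone sketch of the discretization step (step 1 above). It calls ```pd.qcut``` directly, which is what ```QCUT``` wraps (see vectograph/quantizer.py); the column index and labels are chosen here to mirror the CBD example:
+```python
+import pandas as pd
+from sklearn import datasets
+
+X, _ = datasets.fetch_california_housing(return_X_y=True)
+df = pd.DataFrame(X)
+# Bin the 0-th column into 5 quantile-based bins and keep the bin edges.
+labels = ['0_quantile_' + str(i) for i in range(5)]
+discretized, bin_values = pd.qcut(x=df[0], q=5, retbins=True, labels=labels)
+print(discretized.iloc[0])  # expected: 0_quantile_4, the tail entity of the 0-th CBD
+print(bin_values)           # the bin edges stored in Feature_Category_0_Mapping.csv
+```
+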
 ### Scripting Vectograph & [DAIKIRI-Embedding](https://github.com/dice-group/DAIKIRI-Embedding)
 From a tabular data to knowledge graph embeddings
 ```bash
diff --git a/examples/sklearn_example.py b/examples/sklearn_example.py
index f1ef298..4c46955 100644
--- a/examples/sklearn_example.py
+++ b/examples/sklearn_example.py
@@ -15,6 +15,8 @@
 from sklearn import datasets
 
 X, y = datasets.fetch_california_housing(return_X_y=True)
+# Print the first row; it is the row used in the README's CBD example.
+print(X[0])
 X_transformed = QCUT(min_unique_val_per_column=6, num_quantile=5).transform(pd.DataFrame(X))
 # Add prefix
 X_transformed.index = 'Event_' + X_transformed.index.astype(str)
diff --git a/vectograph/quantizer.py b/vectograph/quantizer.py
index 13037bb..8089a0b 100644
--- a/vectograph/quantizer.py
+++ b/vectograph/quantizer.py
@@ -79,14 +79,23 @@ def __perform_discretization(self, column_name: str, df: pd.DataFrame):
 
         # 3. Generate placeholders.
         labels = [column_name + '_quantile_' + str(i) for i in range(self.num_quantile)]
-
-        # 4. discretize
-        discretized, bin_values = pd.qcut(x=df[column_name], q=self.num_quantile, retbins=True, labels=labels,
-                                          duplicates=self.duplicates)
-        # 5. if column contrians ***/*** => # substring: from the index of last / till the end.
+        # 4. Apply the quantile-based discretization function (pd.qcut).
+        try:
+            discretized, bin_values = pd.qcut(x=df[column_name], q=self.num_quantile, retbins=True, labels=labels,
+                                              duplicates=self.duplicates)
+        except ValueError as e:
+            print('#' * 10, end=' ')
+            print('Error while applying the quantile-based discretization function (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html)')
+            print(f'Number of quantiles per column/feature: {self.num_quantile}')
+            print(f'Number of unique values of the column/feature {column_name}: {len(df[column_name].unique())}')
+            print(f'Either reduce the num_quantile parameter or set the duplicates parameter '
+                  f'to "drop" (currently "{self.duplicates}")')
+            raise e
+        # 5. If the column name contains '/', keep only the substring after the last '/' for the file name.
         name_file = column_name[column_name.rfind('/') + 1:]
-
-        pd.DataFrame.from_dict(dict(zip(discretized.cat.categories.tolist(), bin_values)), orient='index').to_csv(self.path + '/Feature_Category_' + name_file + '_Mapping.csv')
+        # 6. Save the mapping from bin labels to bin values.
+        pd.DataFrame.from_dict(dict(zip(discretized.cat.categories.tolist(), bin_values)), orient='index').to_csv(
+            self.path + '/Feature_Category_' + name_file + '_Mapping.csv')
         return 'Feature_Category_' + column_name, discretized, bin_values
 
     def transform(self, df: pd.DataFrame):
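
For context on the ```ValueError``` branch added to ```__perform_discretization``` above: ```pd.qcut``` raises when a column has fewer distinct values than the requested number of quantiles, because the computed bin edges repeat. Below is a minimal standalone sketch of both the failure and the remedy; the toy column is illustrative and not part of the patch:

```python
import pandas as pd

# Only 3 unique values, so 5 quantile bins produce duplicate bin edges.
col = pd.Series([1, 1, 1, 2, 2, 3])
try:
    pd.qcut(x=col, q=5)
except ValueError as e:
    print('qcut failed:', e)

# duplicates='drop' merges the repeated edges instead of raising,
# at the cost of producing fewer bins than requested.
binned, bin_values = pd.qcut(x=col, q=5, retbins=True, duplicates='drop')
print(len(bin_values) - 1, 'bins instead of 5')
```

Because ```duplicates='drop'``` silently yields fewer bins, the added error message suggests either lowering ```num_quantile``` or setting ```duplicates``` to drop.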