cmu-db · ETHenzlere · Jun 30, 2024 · Jun 30, 2024 · Jun 30, 2024 · Jul 15, 2024
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
@@ -328,13 +328,12 @@ jobs:
           if [[ ${{matrix.benchmark}} == templated ]]; then
             java -jar benchbase.jar -b tpcc -c config/mysql/sample_tpcc_config.xml --create=true --load=true --execute=false --json-histograms results/histograms.json
             java -jar benchbase.jar -b ${{matrix.benchmark}} -c config/mysql/sample_${{matrix.benchmark}}_config.xml --create=false --load=false --execute=true --json-histograms results/histograms.json
-          # For anonymization, we load tpcc and anonymize a single table. The workload itself is not executed
-          # FIXME: 'exit 0' is called because there is no benchmark executed and analyzed. Must be removed once the Anonymization script is
-          # fully implemented. See Pull Request 455.
+          # For anonymization, we load tpcc, anonymize a single table, swap the anonymized copy in for the original, and then execute the workload.
           elif [[ ${{matrix.benchmark}} == anonymization ]]; then
             java -jar benchbase.jar -b tpcc -c config/mysql/sample_tpcc_config.xml --create=true --load=true --execute=false --json-histograms results/histograms.json
             java -jar benchbase.jar -b tpcc -c config/mysql/sample_${{matrix.benchmark}}_config.xml --anonymize=true
-            exit 0
+            mysql -h127.0.0.1 -P$MYSQL_PORT -uadmin -ppassword benchbase -e "RENAME TABLE item TO item_original,item_anonymized TO item;"
+            java -jar benchbase.jar -b tpcc -c config/mysql/sample_tpcc_config.xml --execute=true --json-histograms results/histograms.json
           elif [[ ${{matrix.benchmark}} == tpcc-with-reconnects ]]; then
             # See Also: WITH_SERVICE_INTERRUPTIONS=true docker/build-run-benchmark-with-docker.sh
             java -jar benchbase.jar -b tpcc -c config/mysql/sample_tpcc_config.xml --create=true --load=true
@@ -356,6 +355,8 @@ jobs:
           elif [ ${{matrix.benchmark}} == tpcc-with-reconnects ]; then
               ERRORS_THRESHOLD=0.02
               results_benchmark=tpcc
+          elif [ ${{matrix.benchmark}} == anonymization ]; then
+              results_benchmark=tpcc
           fi
           ./scripts/check_latest_benchmark_results.sh $results_benchmark
           ./scripts/check_histogram_results.sh results/histograms.json $ERRORS_THRESHOLD
@@ -460,7 +461,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        benchmark: [ 'auctionmark', 'chbenchmark', 'epinions', 'hyadapt', 'noop', 'otmetrics', 'resourcestresser', 'seats', 'sibench', 'smallbank', 'tatp', 'templated', 'tpcc', 'tpcc-with-reconnects', 'tpch', 'twitter', 'voter', 'wikipedia', 'ycsb' ]
+        benchmark: [ 'anonymization', 'auctionmark', 'chbenchmark', 'epinions', 'hyadapt', 'noop', 'otmetrics', 'resourcestresser', 'seats', 'sibench', 'smallbank', 'tatp', 'templated', 'tpcc', 'tpcc-with-reconnects', 'tpch', 'twitter', 'voter', 'wikipedia', 'ycsb' ]
     steps:
       # Note: we download just the docker-compose scripts/configs rather than the
       # whole source code repo for better testing.
@@ -499,6 +500,21 @@ jobs:
           java-version: ${{env.JAVA_VERSION}}
           distribution: 'temurin'
 
+      - name: Set up Python
+        uses: actions/setup-python@v5  
+        with:  
+          python-version: ${{env.PYTHON_VERSION}}  
+
+      - name: Install Python dependencies  
+        working-directory: ./scripts/anonymization
+        run: |
+          if [[ ${{matrix.benchmark}} == anonymization ]]; then
+            python -m pip install --upgrade pip
+            pip install -r requirements.txt  
+          else
+            echo "Dependency installation not necessary for benchmark"
+          fi
+
       - name: Run benchmark
         run: |
           PGPASSWORD=password dropdb -h localhost -U admin benchbase --if-exists
@@ -509,6 +525,12 @@ jobs:
           if [[ ${{matrix.benchmark}} == templated ]]; then
             java -jar benchbase.jar -b tpcc -c config/postgres/sample_tpcc_config.xml --create=true --load=true --execute=false --json-histograms results/histograms.json
             java -jar benchbase.jar -b ${{matrix.benchmark}} -c config/postgres/sample_${{matrix.benchmark}}_config.xml -im 1000 -mt advanced --create=false --load=false --execute=true --json-histograms results/histograms.json
+          # For anonymization, we load tpcc, anonymize a single table, swap the anonymized copy in for the original, and then execute the workload.
+          elif [[ ${{matrix.benchmark}} == anonymization ]]; then
+            java -jar benchbase.jar -b tpcc -c config/postgres/sample_tpcc_config.xml --create=true --load=true --execute=false --json-histograms results/histograms.json
+            java -jar benchbase.jar -b tpcc -c config/postgres/sample_${{matrix.benchmark}}_config.xml --anonymize=true
+            PGPASSWORD=password psql -h localhost -U admin -d benchbase -c "ALTER TABLE item RENAME TO item_original;" -c "ALTER TABLE item_anonymized RENAME TO item;"
+            java -jar benchbase.jar -b tpcc -c config/postgres/sample_tpcc_config.xml --execute=true --json-histograms results/histograms.json
           elif [[ ${{matrix.benchmark}} == tpcc-with-reconnects ]]; then
             # See Also: WITH_SERVICE_INTERRUPTIONS=true docker/build-run-benchmark-with-docker.sh
             java -jar benchbase.jar -b tpcc -c config/postgres/sample_tpcc_config.xml --create=true --load=true
@@ -530,6 +552,8 @@ jobs:
           elif [ ${{matrix.benchmark}} == tpcc-with-reconnects ]; then
               ERRORS_THRESHOLD=0.02
               results_benchmark=tpcc
+          elif [ ${{matrix.benchmark}} == anonymization ]; then
+              results_benchmark=tpcc
           fi
           ./scripts/check_latest_benchmark_results.sh $results_benchmark
           ./scripts/check_histogram_results.sh results/histograms.json $ERRORS_THRESHOLD

diff --git a/.gitignore b/.gitignore
@@ -60,4 +60,5 @@ docker-compose-*.tar.gz
 
 # Python
 __pycache__/
-*.py[cod]
+*.py[cod]
+*.pytest_cache
diff --git a/config/mysql/sample_anonymization_config.xml b/config/mysql/sample_anonymization_config.xml
@@ -30,12 +30,12 @@
                 </categorical>
             <!-- Continuous column fine-tuning -->
                 <continuous>
-                    <column name="i_price" bins="1000" lower="2.0" upper="100.0" /> 
+                    <column name="i_price" bins="20" lower="2.0" upper="100.0" />
                 </continuous>
             </differential_privacy>
             <!-- Sensitive value handling -->
             <value_faking>
-                <column name="i_name" method="name" locales="en_US" seed="0"/>
+                <column name="i_name" method="pystr" locales="en_US" seed="0"/>
             </value_faking>
         </table>
     </anonymization>

diff --git a/config/postgres/sample_anonymization_config.xml b/config/postgres/sample_anonymization_config.xml
@@ -0,0 +1,84 @@
+<?xml version="1.0"?>
+<parameters>
+
+    <!-- Connection details -->
+    <type>POSTGRES</type>
+    <driver>org.postgresql.Driver</driver>
+    <url>jdbc:postgresql://localhost:5432/benchbase?sslmode=disable&amp;ApplicationName=tpcc&amp;reWriteBatchedInserts=true</url>
+    <username>admin</username>
+    <password>password</password>
+    <reconnectOnConnectionFailure>true</reconnectOnConnectionFailure>
+    <isolation>TRANSACTION_SERIALIZABLE</isolation>
+    <batchsize>128</batchsize>
+
+    <!-- Note: this example anonymizes the "item" table of the tpcc workload.
+        To run, use the `anonymize=true` flag
+    -->
+
+     <!-- The anonymization configuration -->
+    <anonymization>
+        <table name="item">
+            <differential_privacy epsilon="1.0" pre_epsilon="0.0" algorithm="mst">
+            <!-- Column categorization -->
+                <ignore>
+                    <column name="i_id"/>
+                    <column name="i_data" />
+                    <column name="i_im_id" />
+                </ignore>
+                <categorical>
+                    <column name="i_name" />
+                </categorical>
+            <!-- Continuous column fine-tuning -->
+                <continuous>
+                    <column name="i_price" bins="20" lower="2.0" upper="100.0" />
+                </continuous>
+            </differential_privacy>
+            <!-- Sensitive value handling -->
+            <value_faking>
+                <column name="i_name" method="pystr" locales="en_US" seed="0"/>
+            </value_faking>
+        </table>
+    </anonymization>
+
+    <!-- Scale factor is the number of warehouses in TPCC -->
+    <scalefactor>1</scalefactor>
+
+    <!-- The workload -->
+    <terminals>1</terminals>
+    <works>
+        <work>
+            <time>60</time>
+            <rate>10000</rate>
+            <weights>45,43,4,4,4</weights>
+        </work>
+    </works>
+
+    <!-- TPCC specific -->
+    <transactiontypes>
+        <transactiontype>
+            <name>NewOrder</name>
+            <!--<preExecutionWait>18000</preExecutionWait>-->
+            <!--<postExecutionWait>12000</postExecutionWait>-->
+        </transactiontype>
+        <transactiontype>
+            <name>Payment</name>
+            <!--<preExecutionWait>3000</preExecutionWait>-->
+            <!--<postExecutionWait>12000</postExecutionWait>-->
+        </transactiontype>
+        <transactiontype>
+            <name>OrderStatus</name>
+            <!--<preExecutionWait>2000</preExecutionWait>-->
+            <!--<postExecutionWait>10000</postExecutionWait>-->
+        </transactiontype>
+        <transactiontype>
+            <name>Delivery</name>
+            <!--<preExecutionWait>2000</preExecutionWait>-->
+            <!--<postExecutionWait>5000</postExecutionWait>-->
+        </transactiontype>
+        <transactiontype>
+            <name>StockLevel</name>
+            <!--<preExecutionWait>2000</preExecutionWait>-->
+            <!--<postExecutionWait>5000</postExecutionWait>-->
+        </transactiontype>
+    </transactiontypes>
+</parameters>
diff --git a/data/templated/example.xml b/data/templated/example.xml
@@ -1,24 +1,21 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Note: these example templates reuse TPC-C queries and data as a demonstration, but others are possible. -->
-<!-- TODO: Add additional parameter data types to increase test coverage. -->
 <templates>
    <template name="GetOrder">
-      <query><![CDATA[SELECT NO_O_ID FROM new_order WHERE NO_D_ID = ? OR NO_D_ID = ? ORDER BY NO_O_ID ASC]]></query>
+      <query>SELECT NO_O_ID FROM new_order WHERE NO_D_ID = ? OR NO_D_ID = ? ORDER BY NO_O_ID ASC</query>
       <types>
          <type>INTEGER</type>
          <type>INTEGER</type>
       </types>
       <values>
-         <value dist="uniform" min="1" max="3"/>
-         <value dist="zipfian" min="1" max="2"/>
+         <value dist="uniform" min="1" max="3" />
+         <value dist="zipfian" min="1" max="2" />
       </values>
        <values>
-         <value dist="scrambled" min="0" max="4" seed="999"/>
-         <value dist="normal" min="1" max="3" seed="1"/>
+         <value dist="scrambled" min="0" max="4" seed="999" />
+         <value dist="normal" min="1" max="3" seed="1" />
       </values>
    </template>
    <template name="GetCust">
-      <query><![CDATA[SELECT C_DISCOUNT, C_LAST, C_CREDIT FROM customer WHERE C_W_ID = ?]]></query>
+      <query>SELECT C_DISCOUNT, C_LAST, C_CREDIT FROM customer WHERE C_W_ID = ?</query>
       <types>
          <type>INTEGER</type>
       </types>
@@ -27,18 +24,16 @@
       </values>
    </template>
    <template name="GetCustNull">
-      <query><![CDATA[SELECT C_DISCOUNT, C_LAST, C_CREDIT FROM customer WHERE C_W_ID = ? OR C_W_ID = ?]]></query>
+      <query>SELECT * FROM item WHERE i_name LIKE ?</query>
       <types>
-         <type>NULL</type>
-         <type>INTEGER</type>
+         <type>VARCHAR</type>
       </types>
       <values>
-        <value>1</value>
-        <value />
+        <value>nvacqmnftdwkxmdp</value>
       </values>
    </template>
    <template name="GetWarehouse">
-      <query><![CDATA[SELECT * FROM warehouse WHERE W_STREET_1 LIKE ?]]></query>
+      <query>SELECT * FROM warehouse WHERE W_STREET_1 LIKE ?</query> 
       <types>
          <type>VARCHAR</type>
       </types>
@@ -47,22 +42,22 @@
       </values>
    </template>
    <template name="GetItemByPrice">
-      <query><![CDATA[SELECT COUNT(*) FROM item WHERE i_price BETWEEN ? AND ?]]></query>
+      <query>SELECT COUNT(*) FROM item WHERE i_price BETWEEN ? AND ?</query>
       <types>
          <type>FLOAT</type>
          <type>FLOAT</type>
       </types>
       <values>
-         <value dist="normal" min="10" max="12.5"/>
-         <value dist="uniform" min="15.1" max="100.22"/>
+         <value dist="normal" min="10" max="12.5" />
+         <value dist="uniform" min="15.1" max="100.22" />
       </values>
       <values>
          <value>10.50</value>
          <value>11</value>
       </values>
    </template>
     <template name="UpdateItemPrice">
-      <query><![CDATA[UPDATE item SET i_price = i_price + 1 WHERE i_price < ?]]></query>
+      <query>UPDATE item SET i_price = i_price + 1 WHERE i_price &lt; ?</query>
       <types>
          <type>INTEGER</type>
       </types>
@@ -71,25 +66,25 @@
       </values>
    </template>
    <template name="DeleteItem">
-      <query><![CDATA[DELETE FROM oorder WHERE o_entry_d < ?]]></query>
+      <query>DELETE FROM oorder WHERE o_entry_d &lt; ?</query>
       <types>
          <type>TIMESTAMP</type>
       </types>
       <values>
-         <value dist="uniform" min="1000" max="300000"></value>
+         <value dist="uniform" min="1000" max="300000" />
       </values>
        <values>
-         <value dist="normal" min="1000" max="300000"></value>
+         <value dist="normal" min="1000" max="300000" />
       </values>
        <values>
-         <value dist="zipfian" min="1000" max="300000"></value>
+         <value dist="zipfian" min="1000" max="300000" />
       </values>
        <values>
-         <value dist="scrambled" min="1000" max="300000"></value>
+         <value dist="scrambled" min="1000" max="300000" />
       </values>
    </template>
    <template name="InsertItem">
-      <query><![CDATA[INSERT INTO history VALUES(?,?,?,?,?,?,?,?)]]></query>
+      <query>INSERT INTO history VALUES(?,?,?,?,?,?,?,?)</query>
       <types>
          <type>INTEGER</type>
          <type>INTEGER</type>
@@ -106,9 +101,9 @@
          <value>1</value>
          <value>1</value>
          <value>1</value>
-         <value dist="uniform" min="1999-12-12 01:01:55" max="2000-12-12 04:04:44"/>
-         <value dist="normal" min="0.5" max="12.5"/>
-         <value dist="uniform" min="0" max="23"/>
+         <value dist="uniform" min="1999-12-12 01:01:55" max="2000-12-12 04:04:44" />
+         <value dist="normal" min="0.5" max="12.5" />
+         <value dist="uniform" min="0" max="23" />
       </values>
    </template>
 </templates>
diff --git a/scripts/anonymization/README.md b/scripts/anonymization/README.md
@@ -1,7 +1,5 @@
 # Anonymization
 
-**NOTE:ANONYMIZATION IS A WORK IN PROGRESS AND DOES CURRENTLY NOT ACTUALLY ANONYMIZE THE DATA. THIS FEATURE WILL BE ADDED LATER**
-
 The anonymization module allows applying privacy mechanisms such as differential privacy or column faking to the data.
 The system will pull data from the JDBC connection, anonymize the data and push it back to the DBMS by creating a new table.
 
@@ -30,10 +28,20 @@ pylint --rcfile=.pylintrc src/
 Pytest is automatically added by the requirements and can be run as follows:
 
 ```bash
-pytest ./src/test.py
+cd ./src
+pytest test.py
 ```
 
+## Applying the Anonymization
+
+To run the anonymization, add the `--anonymize=true` flag to your command.
+Anonymization is not a standalone benchmark. It will solely create an anonymized copy of input tables.
+
+```bash
+java -jar benchbase.jar -b tpcc -c config/postgres/sample_anonymization_config.xml --anonymize=true
+```
 
+Configuration examples can be found below.
 
 ## Configuration files (XML)
 
@@ -238,6 +246,7 @@ Possible methods for faking can be found on the official [Faker documentation](h
 ### Basic
 
 The most basic config will need only the name of the table and the privacy mechanism. All necessary information is collected automatically.
+This basic configuration is not recommended: the anonymization algorithms do not know how to handle certain columns, which typically results in faulty behavior.
 
 ```xml
 <anonymization>