diff --git a/README.md b/README.md
index 334b8c8..d647351 100644
--- a/README.md
+++ b/README.md
@@ -171,6 +171,8 @@ python -W ignore main.py \
   -m defog/sqlcoder2
 ```
 
+Optionally, if you're running evals on a model that is quantized with AWQ, add the `-qz` or `--quantized` parameter. Only applicable for the vllm runner.
+
 If running with different settings, you can setup an api server to avoid reloading for each test setting and then run the tests subsequently. To setup the api server:
 ```bash
 # to set up a vllm server
@@ -236,6 +238,7 @@ You can use the following flags in the command line to change the configurations
 | -o, --output_file | Output CSV file that will store your results. You need to pass the same number of output file paths as the number of prompt files |
 | -bq, --bq_table | Name of BigQuery table to save to (e.g. eval.results). Remember to save your project_id as an environment variable BQ_PROJECT. |
 | -b, --num_beams | Indicates the number of beams you want to use for beam search at inference. Only available for `hf_runner`, `vllm_runner` and `api_runner`. |
+| -qz, --quantized | Indicate whether the model is an AWQ quantized model. Only available for `vllm_runner`. |
 | -p, --parallel_threads | The default no. of parallel threads is 5. Decrease this to 1 for gpt-4 to avoid the rate limit error. Parallelization support is currently only defined for OpenAI models. |
 | -t, --timeout_gen | No. of seconds before timeout occurs for query generation. The default is 30.0s. |
 | -u, --timeout_exec | No. of seconds before timeout occurs for query execution on the database. The default is 10.0s. |
@@ -253,4 +256,4 @@ We welcome contributions to our project, specifically:
 - New query generators/runners (in the [query_generators](query_generators) and [eval](eval) folders respectively)
 - Improving existing generators/runners (e.g. adding new metrics)
 
-Please see [CONTRIBUTING.md](https://github.com/defog-ai/sql-generation-evaluation/blob/main/CONTRIBUTING.md) for more information.
\ No newline at end of file
+Please see [CONTRIBUTING.md](https://github.com/defog-ai/sql-generation-evaluation/blob/main/CONTRIBUTING.md) for more information.
diff --git a/eval/vllm_runner.py b/eval/vllm_runner.py
index ab33707..33d39a7 100644
--- a/eval/vllm_runner.py
+++ b/eval/vllm_runner.py
@@ -47,7 +47,14 @@ def run_vllm_eval(args):
     # initialize model only once as it takes a while
     print(f"Preparing {model_name}")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    llm = LLM(model=model_name, tensor_parallel_size=torch.cuda.device_count())
+    if not args.quantized:
+        llm = LLM(model=model_name, tensor_parallel_size=torch.cuda.device_count())
+    else:
+        llm = LLM(
+            model=model_name,
+            tensor_parallel_size=torch.cuda.device_count(),
+            quantization="AWQ",
+        )
 
     sampling_params = SamplingParams(
         n=1,
diff --git a/main.py b/main.py
index a22fb54..30fda1c 100644
--- a/main.py
+++ b/main.py
@@ -20,6 +20,9 @@
     parser.add_argument("-t", "--timeout_gen", type=float, default=30.0)
     parser.add_argument("-u", "--timeout_exec", type=float, default=10.0)
     parser.add_argument("-v", "--verbose", action="store_true")
+    parser.add_argument(
+        "-qz", "--quantized", default=False, action=argparse.BooleanOptionalAction
+    )
 
     args = parser.parse_args()
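
As an end-to-end sketch of the new flag in use: the command below assumes the question/prompt/output paths and the `-g vllm` runner selector from the repo's existing vllm example, and the `-m` value is a placeholder for whichever AWQ-quantized checkpoint you want to benchmark.

```bash
# assumed flags mirror the existing vllm_runner example in the README;
# the -m value is a placeholder for any AWQ-quantized checkpoint
python -W ignore main.py \
  -q data/questions_gen.csv \
  -o results/sqlcoder2_awq.csv \
  -g vllm \
  -f prompts/prompt.md \
  -m your-org/sqlcoder2-AWQ \
  -qz
```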
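On the `main.py` side, `argparse.BooleanOptionalAction` (Python 3.9+) gives the flag a `False` default and auto-generates a `--no-quantized` negation alongside `-qz`/`--quantized`. A minimal standalone sketch of the resulting behaviour (not part of the repo):

```python
import argparse

parser = argparse.ArgumentParser()
# same definition as the new flag added in main.py
parser.add_argument(
    "-qz", "--quantized", default=False, action=argparse.BooleanOptionalAction
)

print(parser.parse_args([]).quantized)                  # False (default)
print(parser.parse_args(["-qz"]).quantized)             # True
print(parser.parse_args(["--quantized"]).quantized)     # True
print(parser.parse_args(["--no-quantized"]).quantized)  # False (auto-generated negation)
```

Note that the negated form is only generated for the long `--quantized` spelling; the short `-qz` option can only switch the flag on.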