diff --git a/README.md b/README.md
index 334b8c8..d647351 100644
--- a/README.md
+++ b/README.md
@@ -171,6 +171,8 @@ python -W ignore main.py \
   -m defog/sqlcoder2
 ```
 
+Optionally, if you're running evals on a model that is quantized with AWQ, add the `-qz` or `--quantized` parameter. Only applicable for the vllm runner.
+
 If running with different settings, you can setup an api server to avoid reloading for each test setting and then run the tests subsequently. To setup the api server:
 ```bash
 # to set up a vllm server
@@ -236,6 +238,7 @@ You can use the following flags in the command line to change the configurations
 | -o, --output_file | Output CSV file that will store your results. You need to pass the same number of output file paths as the number of prompt files |
 | -bq, --bq_table | Name of BigQuery table to save to (e.g. eval.results). Remember to save your project_id as an environment variable BQ_PROJECT. |
 | -b, --num_beams | Indicates the number of beams you want to use for beam search at inference. Only available for `hf_runner`, `vllm_runner` and `api_runner`. |
+| -qz, --quantized | Indicate whether the model is an AWQ quantized model. Only available for `vllm_runner`. |
 | -p, --parallel_threads | The default no. of parallel threads is 5. Decrease this to 1 for gpt-4 to avoid the rate limit error. Parallelization support is currently only defined for OpenAI models. |
 | -t, --timeout_gen | No. of seconds before timeout occurs for query generation. The default is 30.0s. |
 | -u, --timeout_exec | No. of seconds before timeout occurs for query execution on the database. The default is 10.0s. |
@@ -253,4 +256,4 @@ We welcome contributions to our project, specifically:
 - New query generators/runners (in the [query_generators](query_generators) and [eval](eval) folders respectively)
 - Improving existing generators/runners (e.g. adding new metrics)
 
-Please see [CONTRIBUTING.md](https://github.com/defog-ai/sql-generation-evaluation/blob/main/CONTRIBUTING.md) for more information.
\ No newline at end of file
+Please see [CONTRIBUTING.md](https://github.com/defog-ai/sql-generation-evaluation/blob/main/CONTRIBUTING.md) for more information.
diff --git a/eval/vllm_runner.py b/eval/vllm_runner.py
index ab33707..33d39a7 100644
--- a/eval/vllm_runner.py
+++ b/eval/vllm_runner.py
@@ -47,7 +47,14 @@ def run_vllm_eval(args):
     # initialize model only once as it takes a while
     print(f"Preparing {model_name}")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    llm = LLM(model=model_name, tensor_parallel_size=torch.cuda.device_count())
+    if not args.quantized:
+        llm = LLM(model=model_name, tensor_parallel_size=torch.cuda.device_count())
+    else:
+        llm = LLM(
+            model=model_name,
+            tensor_parallel_size=torch.cuda.device_count(),
+            quantization="AWQ",
+        )
 
     sampling_params = SamplingParams(
         n=1,
diff --git a/main.py b/main.py
index a22fb54..30fda1c 100644
--- a/main.py
+++ b/main.py
@@ -20,6 +20,9 @@
     parser.add_argument("-t", "--timeout_gen", type=float, default=30.0)
     parser.add_argument("-u", "--timeout_exec", type=float, default=10.0)
     parser.add_argument("-v", "--verbose", action="store_true")
+    parser.add_argument(
+        "-qz", "--quantized", default=False, action=argparse.BooleanOptionalAction
+    )
 
     args = parser.parse_args()
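
As an end-to-end sketch of the new flag in use: the command below assumes the question/prompt/output paths and the `-g vllm` runner selector from the repo's existing vllm example, and the `-m` value is a placeholder for whichever AWQ-quantized checkpoint you want to benchmark.

```bash
# assumed flags mirror the existing vllm_runner example in the README;
# the -m value is a placeholder for any AWQ-quantized checkpoint
python -W ignore main.py \
  -q data/questions_gen.csv \
  -o results/sqlcoder2_awq.csv \
  -g vllm \
  -f prompts/prompt.md \
  -m your-org/sqlcoder2-AWQ \
  -qz
```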
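On the `main.py` side, `argparse.BooleanOptionalAction` (Python 3.9+) gives the flag a `False` default and auto-generates a `--no-quantized` negation alongside `-qz`/`--quantized`. A minimal standalone sketch of the resulting behaviour (not part of the repo):

```python
import argparse

parser = argparse.ArgumentParser()
# same definition as the new flag added in main.py
parser.add_argument(
    "-qz", "--quantized", default=False, action=argparse.BooleanOptionalAction
)

print(parser.parse_args([]).quantized)                  # False (default)
print(parser.parse_args(["-qz"]).quantized)             # True
print(parser.parse_args(["--quantized"]).quantized)     # True
print(parser.parse_args(["--no-quantized"]).quantized)  # False (auto-generated negation)
```

Note that the negated form is only generated for the long `--quantized` spelling; the short `-qz` option can only switch the flag on.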