v0.2.0 (#330)
Co-authored-by: jinz2014 <[email protected]>
Co-authored-by: Jin Z <[email protected]>
3 people authored Feb 15, 2024
1 parent c69d3b6 commit bcaa8a3
Showing 67 changed files with 2,819 additions and 1,478 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
@@ -91,7 +91,7 @@ jobs:
# Install torch
$cudaVersion = $env:CUDA_VERSION.Replace('.', '')
$cudaVersionPytorch = $cudaVersion.Substring(0, $cudaVersion.Length - 1)
if ([int]$cudaVersionPytorch -gt 118) { $pytorchVersion = "torch==2.1.0" } else {$pytorchVersion = "torch==2.0.1"}
if ([int]$cudaVersionPytorch -gt 118) { $pytorchVersion = "torch==2.2.0" } else {$pytorchVersion = "torch==2.0.1"}
python -m pip install --upgrade --no-cache-dir $pytorchVersion+cu$cudaVersionPytorch --index-url https://download.pytorch.org/whl/cu$cudaVersionPytorch
python -m pip install build setuptools wheel ninja
28 changes: 28 additions & 0 deletions .github/workflows/docs.yaml
@@ -0,0 +1,28 @@
name: Documentation
on:
push:
branches:
- main
permissions:
contents: write
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Git Credentials
run: |
git config user.name github-actions[bot]
git config user.email 41898282+github-actions[bot]@users.noreply.github.com
- uses: actions/setup-python@v4
with:
python-version: 3.x
- run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
- uses: actions/cache@v3
with:
key: mkdocs-material-${{ env.cache_id }}
path: .cache
restore-keys: |
mkdocs-material-docs
- run: pip install mkdocstrings-python mkdocs-material griffe-typingdoc
- run: mkdocs gh-deploy --force
31 changes: 2 additions & 29 deletions README.md
@@ -70,33 +70,6 @@ All three methods will install the latest and correct kernels for your system fr

If your system is not supported (i.e. not on the release page), you can build the kernels yourself by following the instructions in [AutoAWQ_Kernels](https://github.com/casper-hansen/AutoAWQ_kernels/releases) and then install AutoAWQ from source.

## Supported models

The detailed support list:

| Models | Sizes |
| -------- | --------------------------- |
| LLaMA-2 | 7B/13B/70B |
| LLaMA | 7B/13B/30B/65B |
| Mistral | 7B |
| Vicuna | 7B/13B |
| MPT | 7B/30B |
| Falcon | 7B/40B |
| OPT | 125m/1.3B/2.7B/6.7B/13B/30B |
| Bloom | 560m/3B/7B/ |
| GPTJ | 6.7B |
| Aquila | 7B |
| Aquila2 | 7B/34B |
| Yi | 6B/34B |
| Qwen | 1.8B/7B/14B/72B |
| BigCode | 1B/7B/15B |
| GPT NeoX | 20B |
| GPT-J | 6B |
| LLaVa | 7B/13B |
| Mixtral | 8x7B |
| Baichuan | 7B/13B |
| QWen | 1.8B/7B/14/72B |

## Usage

Under `examples`, you can find scripts that show how to quantize, run inference, and benchmark AutoAWQ models.
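
For example, a minimal quantization sketch along those lines (the model path, output path, and `quant_config` values are illustrative assumptions, not taken from this commit):

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "mistralai/Mistral-7B-Instruct-v0.1"  # illustrative source model
quant_path = "mistral-instruct-v0.1-awq"           # illustrative output directory
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Load the unquantized model and its tokenizer
model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize with AWQ, then save the quantized weights and tokenizer
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
```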
@@ -122,7 +95,7 @@ Fused modules are a large part of the speedup you get from AutoAWQ. The idea is
- Fused modules are activated when you use `fuse_layers=True`.
- A custom cache is implemented. It preallocates based on batch size and sequence length.
- You cannot change the sequence length after you have created your model.
- Reference: `AutoAWQForCausalLM.from_quantized(max_new_tokens=seq_len, batch_size=batch_size)`
- Reference: `AutoAWQForCausalLM.from_quantized(max_seq_len=seq_len, batch_size=batch_size)` (see the sketch after this list)
- The main accelerator in the fused modules comes from FasterTransformer, which is only compatible with Linux.
- The `past_key_values` from `model.generate()` are only dummy values, so they cannot be used after generation.
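
A minimal loading sketch for the points above, assuming an AWQ checkpoint path and illustrative `max_seq_len`/`batch_size` values:

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = "TheBloke/Mistral-7B-Instruct-v0.1-AWQ"  # illustrative checkpoint

# fuse_layers=True activates the fused modules; the custom cache is
# preallocated for batch_size x max_seq_len, so the sequence length
# cannot be changed after the model has been created.
model = AutoAWQForCausalLM.from_quantized(
    quant_path,
    fuse_layers=True,
    max_seq_len=2048,
    batch_size=1,
)
tokenizer = AutoTokenizer.from_pretrained(quant_path)
```

Because the fused path relies on FasterTransformer kernels, it applies on Linux only.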

Expand Down Expand Up @@ -194,7 +167,7 @@ tokens = tokenizer(
generation_output = model.generate(
tokens,
streamer=streamer,
max_new_tokens=512
max_seq_len=512
)
```

4 changes: 2 additions & 2 deletions awq/__init__.py
@@ -1,2 +1,2 @@
__version__ = "0.1.8"
from awq.models.auto import AutoAWQForCausalLM
__version__ = "0.2.0"
from awq.models.auto import AutoAWQForCausalLM
2 changes: 1 addition & 1 deletion awq/evaluation/__init__.py
@@ -4,4 +4,4 @@
eval_mmlu,
)
from awq.evaluation.humaneval_utils import eval_humaneval
from awq.evaluation.kl_divergence import eval_kl_divergence
from awq.evaluation.kl_divergence import eval_kl_divergence
65 changes: 41 additions & 24 deletions awq/evaluation/eval_utils.py
@@ -9,56 +9,61 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.models.whisper.english_normalizer import BasicTextNormalizer


def get_device():
if torch.backends.mps.is_available():
return 'mps'
return "mps"
elif torch.cuda.is_available():
return 'cuda:0'
return "cuda:0"
else:
return 'cpu'
return "cpu"


def evaluate_perplexity(model, tokenizer):
def _perplexity(nlls, n_samples, seqlen):
return torch.exp(torch.stack(nlls).sum() / (n_samples * seqlen))

# load and prepare dataset
data = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
data = tokenizer("\n\n".join(data['text']), return_tensors='pt')
data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
data = tokenizer("\n\n".join(data["text"]), return_tensors="pt")
data = data.input_ids.to(model.device)

seqlen = 2048
model = model.eval()
n_samples = data.numel() // seqlen

nlls = []

with tqdm(range(n_samples), desc="Perplexity -") as progress_bar:
for i in progress_bar:
start_index = (i * seqlen)
end_index = ((i + 1) * seqlen)
start_index = i * seqlen
end_index = (i + 1) * seqlen
batch = data[:, start_index:end_index].to(model.device)
with torch.no_grad():
logits = model(batch).logits
shift_logits = logits[:, :-1, :].contiguous().float()
shift_labels = data[:, start_index:end_index][:, 1:]
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
loss = loss_fct(
shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
)
neg_log_likelihood = loss.float() * seqlen
nlls.append(neg_log_likelihood)

curr_ppl = _perplexity(nlls, i+1, seqlen)
curr_ppl = _perplexity(nlls, i + 1, seqlen)
progress_bar.set_description(f"Perplexity {curr_ppl:.3f}")

ppl = _perplexity(nlls, n_samples, seqlen)

return ppl.item()


def eval_librispeech(model_id, num_samples=100, batch_size=4):
try:
import jiwer, librosa, soundfile
except ImportError:
print("Please install the following: pip install jiwer librosa soundfile")

dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True)

# Load the Whisper model pipeline for automatic speech recognition
@@ -72,14 +77,15 @@ def eval_librispeech(model_id, num_samples=100, batch_size=4):

# Word normalizer
normalizer = BasicTextNormalizer()

# Load the WER metric
wer_metric = load_metric("wer")

texts = []
audio = []
for i, data in tqdm(enumerate(dataset), total=num_samples, desc="Loading dataset"):
if len(audio) == num_samples: break
if len(audio) == num_samples:
break
audio.append(data["audio"])
texts.append(data["text"])

@@ -88,8 +94,8 @@ def eval_librispeech(model_id, num_samples=100, batch_size=4):

with tqdm(range(0, num_samples, batch_size), desc="Word Error Rate: -") as pbar:
for i in pbar:
batch_audio = audio[i:i+batch_size]
batch_texts = texts[i:i+batch_size]
batch_audio = audio[i : i + batch_size]
batch_texts = texts[i : i + batch_size]

# inference
results = pipe(batch_audio, batch_size=len(batch_audio))
@@ -102,16 +108,26 @@ def eval_librispeech(model_id, num_samples=100, batch_size=4):
references.extend(normalized_texts)

# word error rate computation
wer = wer_metric.compute(predictions=predictions, references=references) * 100
wer = (
wer_metric.compute(predictions=predictions, references=references) * 100
)
pbar.set_description(f"Word Error Rate: {wer:.3f}%")

def eval_mmlu(model_path="gpt2", num_fewshot=1, batch_size=1, device="cuda:0", task_use_pretrained=False):

def eval_mmlu(
model_path="gpt2",
num_fewshot=1,
batch_size=1,
device="cuda:0",
task_use_pretrained=False,
):
try:
import vllm

VLLM_INSTALLED = True
except ImportError:
VLLM_INSTALLED = False

initialize_tasks(verbosity="DEBUG")

if VLLM_INSTALLED:
@@ -133,12 +149,12 @@ def eval_mmlu(model_path="gpt2", num_fewshot=1, batch_size=1, device="cuda:0", t
dtype="float16",
trust_remote_code=True,
)
model_args = ",".join([f"{k}={v}" for k,v in model_args.items()])
model_args = ",".join([f"{k}={v}" for k, v in model_args.items()])

results = evaluator.simple_evaluate(
model=model,
model_args=model_args,
tasks=['mmlu'],
tasks=["mmlu"],
num_fewshot=num_fewshot,
batch_size=batch_size,
device=device,
@@ -147,7 +163,8 @@ def eval_mmlu(model_path="gpt2", num_fewshot=1, batch_size=1, device="cuda:0", t

print(evaluator.make_table(results))

if __name__ == '__main__':

if __name__ == "__main__":
### PERPLEXITY
# model_path = 'mistralai/Mistral-7B-Instruct-v0.1'
# model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
@@ -156,5 +173,5 @@ def eval_mmlu(model_path="gpt2", num_fewshot=1, batch_size=1, device="cuda:0", t

### WORD ERROR RATE
# model_id = "distil-whisper/distil-small.en" # 3.594
model_id = "distil-whisper/distil-medium.en" # 3.436
model_id = "distil-whisper/distil-medium.en" # 3.436
eval_librispeech(model_id)