v0.2.0 (#330)
Co-authored-by: jinz2014 <[email protected]>
Co-authored-by: Jin Z <[email protected]>
3 people authored Feb 15, 2024
1 parent c69d3b6 commit bcaa8a3
Showing 67 changed files with 2,819 additions and 1,478 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
@@ -91,7 +91,7 @@ jobs:
# Install torch
$cudaVersion = $env:CUDA_VERSION.Replace('.', '')
$cudaVersionPytorch = $cudaVersion.Substring(0, $cudaVersion.Length - 1)
if ([int]$cudaVersionPytorch -gt 118) { $pytorchVersion = "torch==2.1.0" } else {$pytorchVersion = "torch==2.0.1"}
if ([int]$cudaVersionPytorch -gt 118) { $pytorchVersion = "torch==2.2.0" } else {$pytorchVersion = "torch==2.0.1"}
python -m pip install --upgrade --no-cache-dir $pytorchVersion+cu$cudaVersionPytorch --index-url https://download.pytorch.org/whl/cu$cudaVersionPytorch
python -m pip install build setuptools wheel ninja
28 changes: 28 additions & 0 deletions .github/workflows/docs.yaml
@@ -0,0 +1,28 @@
name: Documentation
on:
push:
branches:
- main
permissions:
contents: write
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Git Credentials
run: |
git config user.name github-actions[bot]
git config user.email 41898282+github-actions[bot]@users.noreply.github.com
- uses: actions/setup-python@v4
with:
python-version: 3.x
- run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
- uses: actions/cache@v3
with:
key: mkdocs-material-${{ env.cache_id }}
path: .cache
restore-keys: |
mkdocs-material-docs
- run: pip install mkdocstrings-python mkdocs-material griffe-typingdoc
- run: mkdocs gh-deploy --force
31 changes: 2 additions & 29 deletions README.md
@@ -70,33 +70,6 @@ All three methods will install the latest and correct kernels for your system fr

If your system is not supported (i.e. not on the release page), you can build the kernels yourself by following the instructions in [AutoAWQ_Kernels](https://github.com/casper-hansen/AutoAWQ_kernels/releases) and then install AutoAWQ from source.

## Supported models

The detailed support list:

| Models | Sizes |
| -------- | --------------------------- |
| LLaMA-2 | 7B/13B/70B |
| LLaMA | 7B/13B/30B/65B |
| Mistral | 7B |
| Vicuna | 7B/13B |
| MPT | 7B/30B |
| Falcon | 7B/40B |
| OPT | 125m/1.3B/2.7B/6.7B/13B/30B |
| Bloom | 560m/3B/7B/ |
| GPTJ | 6.7B |
| Aquila | 7B |
| Aquila2 | 7B/34B |
| Yi | 6B/34B |
| Qwen | 1.8B/7B/14B/72B |
| BigCode | 1B/7B/15B |
| GPT NeoX | 20B |
| GPT-J | 6B |
| LLaVa | 7B/13B |
| Mixtral | 8x7B |
| Baichuan | 7B/13B |
| QWen | 1.8B/7B/14/72B |

## Usage

Under `examples`, you can find scripts that show how to quantize, run inference, and benchmark AutoAWQ models.
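
For example, a minimal quantization sketch along those lines (the model path, output path, and `quant_config` values are illustrative assumptions, not taken from this commit):

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "mistralai/Mistral-7B-Instruct-v0.1"  # illustrative source model
quant_path = "mistral-instruct-v0.1-awq"           # illustrative output directory
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Load the unquantized model and its tokenizer
model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize with AWQ, then save the quantized weights and tokenizer
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
```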
@@ -122,7 +95,7 @@ Fused modules are a large part of the speedup you get from AutoAWQ. The idea is
- Fused modules are activated when you use `fuse_layers=True`.
- A custom cache is implemented. It preallocates based on batch size and sequence length.
- You cannot change the sequence length after you have created your model.
- Reference: `AutoAWQForCausalLM.from_quantized(max_new_tokens=seq_len, batch_size=batch_size)`
- Reference: `AutoAWQForCausalLM.from_quantized(max_seq_len=seq_len, batch_size=batch_size)` (see the sketch after this list)
- The main accelerator in the fused modules comes from FasterTransformer, which is only compatible with Linux.
- The `past_key_values` from `model.generate()` are only dummy values, so they cannot be used after generation.
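
A minimal loading sketch for the points above, assuming an AWQ checkpoint path and illustrative `max_seq_len`/`batch_size` values:

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = "TheBloke/Mistral-7B-Instruct-v0.1-AWQ"  # illustrative checkpoint

# fuse_layers=True activates the fused modules; the custom cache is
# preallocated for batch_size x max_seq_len, so the sequence length
# cannot be changed after the model has been created.
model = AutoAWQForCausalLM.from_quantized(
    quant_path,
    fuse_layers=True,
    max_seq_len=2048,
    batch_size=1,
)
tokenizer = AutoTokenizer.from_pretrained(quant_path)
```

Because the fused path relies on FasterTransformer kernels, it applies on Linux only.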

Expand Down Expand Up @@ -194,7 +167,7 @@ tokens = tokenizer(
generation_output = model.generate(
tokens,
streamer=streamer,
max_new_tokens=512
max_seq_len=512
)
```

4 changes: 2 additions & 2 deletions awq/__init__.py
@@ -1,2 +1,2 @@
__version__ = "0.1.8"
from awq.models.auto import AutoAWQForCausalLM
__version__ = "0.2.0"
from awq.models.auto import AutoAWQForCausalLM
2 changes: 1 addition & 1 deletion awq/evaluation/__init__.py
@@ -4,4 +4,4 @@
eval_mmlu,
)
from awq.evaluation.humaneval_utils import eval_humaneval
from awq.evaluation.kl_divergence import eval_kl_divergence
from awq.evaluation.kl_divergence import eval_kl_divergence
65 changes: 41 additions & 24 deletions awq/evaluation/eval_utils.py
@@ -9,56 +9,61 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.models.whisper.english_normalizer import BasicTextNormalizer


def get_device():
if torch.backends.mps.is_available():
return 'mps'
return "mps"
elif torch.cuda.is_available():
return 'cuda:0'
return "cuda:0"
else:
return 'cpu'
return "cpu"


def evaluate_perplexity(model, tokenizer):
def _perplexity(nlls, n_samples, seqlen):
return torch.exp(torch.stack(nlls).sum() / (n_samples * seqlen))

# load and prepare dataset
data = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
data = tokenizer("\n\n".join(data['text']), return_tensors='pt')
data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
data = tokenizer("\n\n".join(data["text"]), return_tensors="pt")
data = data.input_ids.to(model.device)

seqlen = 2048
model = model.eval()
n_samples = data.numel() // seqlen

nlls = []

with tqdm(range(n_samples), desc="Perplexity -") as progress_bar:
for i in progress_bar:
start_index = (i * seqlen)
end_index = ((i + 1) * seqlen)
start_index = i * seqlen
end_index = (i + 1) * seqlen
batch = data[:, start_index:end_index].to(model.device)
with torch.no_grad():
logits = model(batch).logits
shift_logits = logits[:, :-1, :].contiguous().float()
shift_labels = data[:, start_index:end_index][:, 1:]
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
loss = loss_fct(
shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
)
neg_log_likelihood = loss.float() * seqlen
nlls.append(neg_log_likelihood)

curr_ppl = _perplexity(nlls, i+1, seqlen)
curr_ppl = _perplexity(nlls, i + 1, seqlen)
progress_bar.set_description(f"Perplexity {curr_ppl:.3f}")

ppl = _perplexity(nlls, n_samples, seqlen)

return ppl.item()


def eval_librispeech(model_id, num_samples=100, batch_size=4):
try:
import jiwer, librosa, soundfile
except ImportError:
print("Please install the following: pip install jiwer librosa soundfile")

dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True)

# Load the Whisper model pipeline for automatic speech recognition
@@ -72,14 +77,15 @@ def eval_librispeech(model_id, num_samples=100, batch_size=4):

# Word normalizer
normalizer = BasicTextNormalizer()

# Load the WER metric
wer_metric = load_metric("wer")

texts = []
audio = []
for i, data in tqdm(enumerate(dataset), total=num_samples, desc="Loading dataset"):
if len(audio) == num_samples: break
if len(audio) == num_samples:
break
audio.append(data["audio"])
texts.append(data["text"])

@@ -88,8 +94,8 @@ def eval_librispeech(model_id, num_samples=100, batch_size=4):

with tqdm(range(0, num_samples, batch_size), desc="Word Error Rate: -") as pbar:
for i in pbar:
batch_audio = audio[i:i+batch_size]
batch_texts = texts[i:i+batch_size]
batch_audio = audio[i : i + batch_size]
batch_texts = texts[i : i + batch_size]

# inference
results = pipe(batch_audio, batch_size=len(batch_audio))
@@ -102,16 +108,26 @@ def eval_librispeech(model_id, num_samples=100, batch_size=4):
references.extend(normalized_texts)

# word error rate computation
wer = wer_metric.compute(predictions=predictions, references=references) * 100
wer = (
wer_metric.compute(predictions=predictions, references=references) * 100
)
pbar.set_description(f"Word Error Rate: {wer:.3f}%")

def eval_mmlu(model_path="gpt2", num_fewshot=1, batch_size=1, device="cuda:0", task_use_pretrained=False):

def eval_mmlu(
model_path="gpt2",
num_fewshot=1,
batch_size=1,
device="cuda:0",
task_use_pretrained=False,
):
try:
import vllm

VLLM_INSTALLED = True
except ImportError:
VLLM_INSTALLED = False

initialize_tasks(verbosity="DEBUG")

if VLLM_INSTALLED:
@@ -133,12 +149,12 @@ def eval_mmlu(model_path="gpt2", num_fewshot=1, batch_size=1, device="cuda:0", t
dtype="float16",
trust_remote_code=True,
)
model_args = ",".join([f"{k}={v}" for k,v in model_args.items()])
model_args = ",".join([f"{k}={v}" for k, v in model_args.items()])

results = evaluator.simple_evaluate(
model=model,
model_args=model_args,
tasks=['mmlu'],
tasks=["mmlu"],
num_fewshot=num_fewshot,
batch_size=batch_size,
device=device,
@@ -147,7 +163,8 @@ def eval_mmlu(model_path="gpt2", num_fewshot=1, batch_size=1, device="cuda:0", t

print(evaluator.make_table(results))

if __name__ == '__main__':

if __name__ == "__main__":
### PERPLEXITY
# model_path = 'mistralai/Mistral-7B-Instruct-v0.1'
# model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
@@ -156,5 +173,5 @@ def eval_mmlu(model_path="gpt2", num_fewshot=1, batch_size=1, device="cuda:0", t

### WORD ERROR RATE
# model_id = "distil-whisper/distil-small.en" # 3.594
model_id = "distil-whisper/distil-medium.en" # 3.436
model_id = "distil-whisper/distil-medium.en" # 3.436
eval_librispeech(model_id)