diff --git a/Dockerfile b/Dockerfile index f41753aeb52a6..6ef03b843f457 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,14 +41,6 @@ ENV NVCC_THREADS=$nvcc_threads RUN python3 setup.py build_ext --inplace -# Build the megablocks library as wheel because it doesn't publish pre-built wheels. -# https://github.com/stanford-futuredata/megablocks/commit/5897cd6f254b7b3edf7a708a3a3314ecb54b6f78 -RUN apt-get install -y git && \ - git clone https://github.com/stanford-futuredata/megablocks.git && \ - cd megablocks && \ - git checkout 5897cd6f254b7b3edf7a708a3a3314ecb54b6f78 && \ - MAX_JOBS=8 NVCC_THREADS=8 python3 setup.py bdist_wheel - # image to run unit testing suite FROM dev AS test @@ -85,12 +77,8 @@ FROM vllm-base AS vllm-openai RUN --mount=type=cache,target=/root/.cache/pip \ pip install accelerate -COPY vllm vllm COPY --from=build /workspace/vllm/*.so /workspace/vllm/ -COPY --from=build /workspace/megablocks/dist/*.whl /tmp/ -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install /tmp/megablocks-0.5.0-cp310-cp310-linux_x86_64.whl && \ - rm /tmp/megablocks-0.5.0-cp310-cp310-linux_x86_64.whl +COPY vllm vllm ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/README.md b/README.md index 84cadee4839fc..e4b3b50260182 100644 --- a/README.md +++ b/README.md @@ -72,10 +72,6 @@ Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/get ```bash pip install vllm ``` -**NOTE:** The Mixtral model additionally requires `megablocks` which can be installed with pip or [from source](https://github.com/stanford-futuredata/megablocks): -```bash -pip install megablocks -``` ## Getting Started diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index c189a83a2e6ce..e7a2d0a6f0d03 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -20,7 +20,7 @@ You can install vLLM using pip: .. code-block:: console $ # (Optional) Create a new conda environment. - $ conda create -n myenv python=3.8 -y + $ conda create -n myenv python=3.9 -y $ conda activate myenv $ # Install vLLM with CUDA 12.1. @@ -34,8 +34,9 @@ You can install vLLM using pip: .. code-block:: console $ # Install vLLM with CUDA 11.8. - $ # Replace `cp310` with your Python version (e.g., `cp38`, `cp39`, `cp311`). - $ pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-cp310-cp310-manylinux1_x86_64.whl + $ export VLLM_VERSION=0.2.4 + $ export PYTHON_VERSION=39 + $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl $ # Re-install PyTorch with CUDA 11.8. $ pip uninstall torch -y diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 545e41829bba2..44e4fe5ead988 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -73,6 +73,9 @@ If your model uses one of the above model architectures, you can seamlessly run Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model. Alternatively, you can raise an issue on our `GitHub `_ project. +.. note:: + Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. + .. 
tip:: The easiest way to check if your model is supported is to run the program below: @@ -84,12 +87,17 @@ Alternatively, you can raise an issue on our `GitHub `_ instead of HuggingFace Hub, set an environment variable: .. code-block:: shell $ export VLLM_USE_MODELSCOPE=True + And use with :code:`trust_remote_code=True`. + .. code-block:: python from vllm import LLM @@ -97,5 +105,3 @@ Alternatively, you can raise an issue on our `GitHub str: + prompts = [] + with open(filename, "r") as f: + prompt = f.readline() + prompts.append(prompt) + return prompts @pytest.fixture def example_prompts() -> List[str]: - return _TEST_PROMPTS + prompts = [] + for filename in _TEST_PROMPTS: + prompts += _read_prompts(os.path.join("tests", filename)) + return prompts + + +@pytest.fixture +def example_long_prompts() -> List[str]: + prompts = [] + for filename in _LONG_PROMPTS: + prompts += _read_prompts(os.path.join("tests", filename)) + return prompts _STR_DTYPE_TO_TORCH_DTYPE = { diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py new file mode 100644 index 0000000000000..83316fcb7469d --- /dev/null +++ b/tests/models/test_mistral.py @@ -0,0 +1,37 @@ +"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling. + +Run `pytest tests/models/test_mistral.py --forked`. +""" +import pytest + +MODELS = [ + "mistralai/Mistral-7B-Instruct-v0.1", +] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_models( + hf_runner, + vllm_runner, + example_long_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + hf_model = hf_runner(model, dtype=dtype) + hf_outputs = hf_model.generate_greedy(example_long_prompts, max_tokens) + del hf_model + + vllm_model = vllm_runner(model, dtype=dtype) + vllm_outputs = vllm_model.generate_greedy(example_long_prompts, max_tokens) + del vllm_model + + for i in range(len(example_long_prompts)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_outputs[i] + assert hf_output_str == vllm_output_str, ( + f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") + assert hf_output_ids == vllm_output_ids, ( + f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") diff --git a/tests/prompts/example.txt b/tests/prompts/example.txt new file mode 100644 index 0000000000000..e1b97bc6eee75 --- /dev/null +++ b/tests/prompts/example.txt @@ -0,0 +1,8 @@ +vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. +Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. +Compare and contrast artificial intelligence with human intelligence in terms of processing information. +Describe the basic components of a neural network and how it can be trained. +Write a short story about a robot that dreams for the first time. +Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. +Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. +Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' 
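The conftest changes above move the test prompts out of Python literals and into files under tests/prompts/, with each fixture collecting one prompt (the first line) from every file it lists. For inspecting those prompts outside pytest, here is a minimal standalone sketch of the same aggregation, assuming only the tests/prompts/example.txt and tests/prompts/summary.txt paths shown in this diff:

import os
from typing import List

_TEST_PROMPTS = ["prompts/example.txt"]
_LONG_PROMPTS = ["prompts/summary.txt"]


def _read_prompts(filename: str) -> List[str]:
    # Mirrors the fixture helper above: only the first line of each file is used.
    with open(filename, "r") as f:
        return [f.readline()]


def collect_prompts(filenames: List[str]) -> List[str]:
    prompts: List[str] = []
    for filename in filenames:
        prompts += _read_prompts(os.path.join("tests", filename))
    return prompts


if __name__ == "__main__":
    print(collect_prompts(_TEST_PROMPTS))   # short example prompt(s)
    print(collect_prompts(_LONG_PROMPTS))   # single long summarization prompt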
diff --git a/tests/prompts/summary.txt b/tests/prompts/summary.txt new file mode 100644 index 0000000000000..2f947a264ce93 --- /dev/null +++ b/tests/prompts/summary.txt @@ -0,0 +1 @@ +Subtitles: for our annual races at Knockhill Circuit.Today\'s racing comes from the Porsche Carrera Cup Great Britainand the Legends Cars Elite Cup with JLM.It\'s the latter who get us underway with their first race of the day,and joining me in the commentary box is Paul O\'Neill.First race of the day for the Legends.Jonty Norman has drawn pole position,with Matt Knight alongside.Marcus Pett on Row 2 with Daniel Pooley.Declan Burke is next up, and then Tyler Read, on Row 3.He\'s leading the rookie championship at the moment.Chris Needham on Row 4 with Luke Simmons.Andrew Rogerson and Gareth Sheridan on Row 5.Sixth row, Peter Barrable, with Charlie Budd.Row 7, Jack Parker, fourth in the championship right now.Nick Price is next to him.Will Gibson, who looks like he\'s out of the championship contention now,with Oli Schlup alongside.Then Ben McNeice and Flight Lieutenant Matt Isherwood.Robert Barrable, championship leader, he\'s on Row 10.Then Brent Bowie from Kieran Beattie and Nick Bridgeman.Mike Schlup on Row 12, followed by Ryan McLeish,who won the day overall yesterday.Mark Beaty, Row 13, with Andy Bird.Then it\'s Ben Higgins and Nathan Anthony.Connor Mills and Paul Musselle complete Row 15.And completing the grid is James Newbery.Here we go, with Race number 1 of the day,the final day of the first ever Legends Cars Elite Cup with JLM.And on the front row, it\'s Jonty Norman in grey,Matt Knight in black and gold.Coming from third place on the grid is Marcus Pett,who goes left of shot in the gunmetal carto challenge for the lead.Marcus Pett, the man from Boston in Lincolnshire,goes through into lead position.Very definitely a fancied championship runnerbut hasn\'t quite had the rub of the green this weekend.And they all pile into McIntyre\'s for the first time.And this is where we look for driving standards.James Newbery brakes at the back.He\'s got Paul Musselle immediately in front of him.Those two had an interesting battle yesterdayinvolving a little bit of contact, I think,but they\'re both all right at the moment, as they clear the chicane for the first time.Marcus Pett is away.The difference you\'ll see in Legends Cars racing todayis that for this meeting,the bump drafting that we\'ve seen in the pasthas been ruled out for this round,and it\'s under review for the future.But look at the battle for second position, three wide,as Marcus Pett comes in front of the crowds here.Matt Knight on the inside, Dan Pooley on the outside in 32.Dan Pooley challenging for third. He had a strong day yesterday -he was up in the top ten, which was great to see.The man from March.That third car there, eclipsed at the moment,comes out of the slipstream.Dan repaired his own car after Croft,and that of Kieran Beaty,so I know Kieran wanted to thank him for that. 
He\'s been working hard.And Pooley side by side with Matt Knight.We\'ve got the 13, Chris Needham car, up there in the mix as well.The three top guys in the...Ryan McLeish getting very sideways there,the Scot in the 71 car.The first time we\'ve seen him on our ITV coverage.He\'s not a guest driver this week.I suppose you could technically call him a guest,but he\'s fully championship registeredand took a splendid win yesterday - overall win and race win.Overall on points.Sorry, Paul, gets a chance to get you in.That\'s Jack Parker!Oh, what\'s happened there?So, this was the start. They\'re all still warming the tyres up,ready for the lights to go green,which they do... around about now.And they get going.And then there was a car, wasn\'t there?Oh, I tell you what, that could\'ve ended up really nastyas it snaked up the grass.Yeah, I\'ll tell you what, the moment when the lights went outwas when Marcus Pett broke ranks.That was a very, very meticulous start from Marcus Pett.The blue car here is Tyler Read, top rookie,who looks like he\'s going down the inside of Daniel Pooley,so he\'s gonna make a space here.So, Dan Pooley has lost second position.It\'s Marcus Pett still out front. Matt Knight...I was saying to the drivers,"Don\'t go away if you\'re in the lead because you won\'t get any coverage." Pett\'s down the road, isn\'t he? Look at the gap he\'s got. Yeah.He\'s got three seconds. It\'s gonna be more than that.What I was quite concerned about was the damp part of the circuitdown at the hairpin, where you need to be down the inside of peopleto get the braking done,but these guys seem to be all respecting...Not track limits, but they\'re respecting each other around usbecause I was quite concerned about coming here,but this is quite synonymous with Legends racing at Knockhill.And look at this now. Knight has got...Look at that. I remember Marcus getting his first race win,which was at Snetterton years ago.It\'s always fantastic to see a first-time winner.And Tyler Read is giving him a great workout.Matt Knight back in third.It\'s between the top two at the moment. Oh! Tyler goes wide.He\'s throwing the car around.Marcus Pett, looking a little bit smoother in the 79,was very frustrated yesterday, but Read\'s all over him.Yeah, but look at this now.You\'ve got third, fourth, fifth and sixth.This is gonna be absolutely spectacular!Tyler Read\'s gone! What\'s gone on?!Oh, has the Treherne engine gone pop? He\'s lost a lot of ground.Is he gonna come back into it?Now it\'s Knight having a go on the outside line again.Matt Knight can\'t do it. He runs out wide.Oli Schlup\'s coming through.Schlup hasn\'t had a win yet in Legends cars, so he\'s queueing up.They\'re coming onto the last lap.This could be a key moment for Oli Schlup,who\'s back in third in the K-Seal car.Across the line.Marcus Pett soaking up the pressure brilliantly so far.But does he need to be in front as they come onto the last lap?I don\'t know, but I think Read must have missed a gear,as someone\'s exited stage left.Look at that, back in the mix!It\'s now six for the lead. Can Pett hold on?Championship leader Robert Barrablehas come through from about three rows from the back,and he\'s at the back of the train.Barrable here is gonna extend his championship leadand start towards the front of the grid for Race 2.Barrable, the Irishman, he\'s there.The white car with the green and orange stripeson the nose cone of the car.But it\'s Marcus Pett out front at the moment... 
Oh!Matt Isherwood\'s rejoined at the back in the black and green.Isherwood\'s got back at them. Matt Knight\'s having a go.Along Railway Straight.Schlup would normally bump draft him. He can\'t do that on the rules.But look at Marcus Pett.Fairly wide-ish line in. Good defensive stuff from Pett.It\'s all about the run up to the hill now.And Marcus Pett is gonna take the win, I think.Here they come, up towards the line. Pett from Matt Knight.It\'s gonna be Matt\'s best resultin the Legends Cars National Championship.Third position goes to Oli Schlup, who is delighted with that.Then it was Tyler Read. Great race from him.Robert Barrable, though...Barrable, from 19th on the grid, without bump drafting,comes through into fifth placeahead of the excellent recovery from Flight Lieutenant Matt Isherwood.Dan Pooley seventh. Another great result for Dan Pooley.So much to take away from those last racing laps.Oh, and those last four lapsis exactly why we have these Legends on the TOCA package.That was exceptional.Marcus Pett looked like a dead cert not to finish first,but congratulations to you. That was brilliant.But Barrable, after exiting stage leftwhen he caught the back of everybody and got right up there...There\'s too much to talk about. Let\'s just talk about this guy.Pett, you are a legend, mate. Well done.Cracking. It is a lad and dad.Literally, Marcus and his dad, Robert, they look after the car.It is lad and dad. We hear that mentioned in other formulas,but genuinely, that is all it is.It is very difficult for drivers like that and teams like thatto come and race on this stage.It is a big thing. And he\'s such a smashing guy.And his dad as well. Really delighted with the win.Super stuff by Matt Knight. brilliant from Oli Schlup.Fantastic as well from Tyler Read.And on the front row,it\'s Jonty Norman in grey, Matt Knight in black and gold.Coming from third place on the grid is Marcus Pett.Bit of a shemozzle at the back.Two cars hooked up, which is not good to see.Oh, has the Treherne engine gone pop? He\'s lost a lot of ground.Now it\'s Knight having a go on the outside line again.Matt Knight can\'t do it. He runs out wide.Oli Schlup\'s coming through.And Marcus Pett is gonna take the win, I think. Pett from Matt Knight. It\'s gonna be Matt\'s best resultin the Legends Cars National Championship.Here\'s how they finished.Marcus Pett takes another win in the Legends Cars Elite Cup with JLM.READS INFOREADS INFOREADS INFOREADS INFOREADS INFOREADS INFOProblems in that race for Ryan McLeish, yesterday\'s winner.Charlie Budd in 30th.And the other driver having problems, obviously,from that first stoppage, Brent Bowie.Marcus, that was a tough racebecause there was a red flag in the middle of it.Actually, the first bit, you got away,but it was a full reset,and pressure throughout to the chequered flag.Yeah, definitely.We had an ideal start and managed to build up a lead early on,which was great, but when you\'re in that position,the last thing you want to see is a red flag. 
iming line at the end of lap one.So, Gus Burton leads the way.Big, big dive by Foster on the inside,to go back ahead of Wylie.He goes off the road and back on again.He\'s all sideways.And diving up on the outside line comes Ryan Ratcliffe.Wylie here battling with one of the Pro category cars,but behind him, all the Pro-Am opposition crawling all over him.Well, that was dramatic stuff, wasn\'t it?Round the outside of Turn 1, put Harry Foster in the wrong place.That was Max Bird going wide, number 44, the pink and blue car.So that\'s just haemorrhaged places in Pro-Am.And he\'s the... Oh, a puncture.There\'s somebody with a puncture. Is that Angus Whiteside? Possibly.Let\'s see.I think it is. And you\'ve got this damp patch on the inside,on the braking there, just at the final into the hairpin.This has been a dramatic start to this race for Porsches.Absolutely right.Coming up over the timing line, Gus Burton leads the way.Nine tenths of a second to the good.Big effort being made by Jason Lockwoodin the yellow and orange car in the background, look,to try to get up the inside line, then diving down towards Turn 1.Goes ahead of Oliver White, the very experienced Formula 4 champion.In the silver car, Oliver White, back into Carrera Cup.Remember, he did a full season last year.Good to have him back on the grid.As the cars clamber their way up over the kerb,through the chicane.But Gus Burton saying to everybody, "I\'m back." He leads.Yeah, a dramatic way for Gus Burton to come back to this championship.Remember, he started this year with Century Motorsport but then ducked out of the championship prior to Thruxton.He\'s still competing in the Supercup series with Fach Auto.As there in the pits, getting a new rear left tyre, is Angus Whiteside.But Gus Burton absolutely on it.Very quick in testing here during the week.They tested on Wednesday and on Friday.Gus Burton very quick in...And he\'s really enjoying life now.Back in the championship with the NAPA Racing UK supportand with a different team, Nick Tandy\'s JTR outfit.And he\'s done the fastest lap of the race, as he leads.He is not in the championship fight, but he wants to win races.Car off. It\'s Max Bird again.So, Max Bird, the Pro-Am championship leader,three times a winner in class this year,off the road and back on again.But that\'s gonna throw him way, way down the order.This race is going from bad to worse for him.It\'s just completely unfolded for poor Max Bird.That\'s the curse of having our camera on board, I think,but it\'s just unravelled after a great qualifying.Now, you were talking about Gus Burton\'s start,and it is going to be investigated after the race.OK. Well, it\'ll take a lot of camera action analysisto look at it. This is on board with Bird.Round Turn 1.All OK there. Very close... Goes to the outside.That\'s dangerous cos you can get knocked wide,and that\'s exactly what happens.The man he was trying to get past, Josh Stanton,who spent last night trackside at Cowdenbeath watching stock cars.I\'m not suggesting for a moment he\'s learnt how to defend,but he was enjoying himself, watching a different form of racing.I think all the best people were at Cowdenbeath, weren\'t they?Nick Tandy was, and others. 
Oh!As there, absolutely on the giddy limit, is Harry Foster,making his way in sixth place.Down towards the hairpin.He\'s dropped back from that leading quintet,but he\'s keeping Ross Wylie at bay.Ross Wylie, there, creeping into shot, leads now Pro-Amahead of Ryan Ratcliffe.And Josh Stanton is third in Pro-Am, last year\'s Am champion.Yeah, and Ross Wylie the only Scottish driver in the race. A lot of support for him,from local sponsors as well as the public.Buoyed by his recent run at the British Grand Prix at Supercup,and thoroughly loving racing at his home circuit, Ross Wylie.Track is nicely dry.There was some threats of possible rain.We had rain yesterday during qualifying.They actually only got one runon their slick tyres yesterday in qualifyingbefore the rain arrived, and that set the grid.So, Gus Burton\'s lead growing all the time.1.3 seconds now, that margin over Adam Smalley.As Max Bird tries to fight back in Pro-Am.Gets up the inside line there.So, that puts him ahead of David Stirling.So, he\'s split the second and third Am fightas he tries to recover.Yeah, but he\'s lost a lot of ground with that momenton the outside of McIntyre\'s.It\'s getting a lot darker overhead at Knockhill,even though there is a break in the cloud.A big effort there from the lapped car of Angus Whiteside.He\'s not fighting for position, he\'s trying to unlap himself.But just wonder whether we might get so f the right of McIntyre\'s,up towards Butcher\'s, then the chicane.And looking to try and maintain this 100% recordin the Team Parker Racing-run car in Am.Yeah. David Fairbrother in second place,but some 11 seconds behind in the Am category.But he will take another podium.His second in the championship, too, Justin Sherwood.The race leader 2.5 seconds to the good, Gus Burton.Other battles still to be resolved.What\'s going on in Pro-Am? Ross Wylie leads.He\'s fallen back behind Josh Malin overall. That was the move.Josh Malin through on the inside at the hairpin.Ross Wylie, in a sense, content to let that happen - gave him room -because that\'s not his battle, but what it does meanis that Ryan Ratcliffe, his class rival,is directly behind him.This is William Aspin versus Max Bird for sixth in Pro-Am.And a very determined Max Bird goes one side, get his nose chopped off.Will Aspin, the man from Florence, defends on the other side.They\'re absolutely together, almost touching.Here comes Max Bird.Oh, but he can\'t find a way through there.Angus Whiteside is now getting in on the act.Round the outside goes Max Bird, but they both take it wide,and through goes Angus Whiteside on the inside.Doesn\'t affect the race order.Whiteside unlaps himself from those two cars. Will Aspin stays ahead. Max Bird tries to fight back.Down towards Duffus Dip.Ignore the car in the lead of this battle packbecause it\'s not on the lead lap.But then Aspin under attack.Max Bird tries to get up alongside himfor the inside line coming into McIntyre\'s.He is on the inside, and he is ahead now.Yeah. And behind him, there was a car completely off on the grassafter Turn 1.So I do think that section of the track is a little slippery,for whatever reason. 
Maybe it just hasn\'t quite dried out.But this was a great battle between Max Bird and Will Aspin.So, drivers, in one or two cases,setting personal best lap times last time around,suggesting that the road is drying still.The cars are getting lighter on fuel anyway.Down at the hairpin comes the recovering Max Bird,as over the line goes Harry Foster, being chased by Josh Malin.Josh up into seventh overall.A top six could be on - he\'s only half a second back.Yeah, it\'s not far away, is it?And still plenty of laps left in this race.You probably noticed through that Turn 1the drivers are not riding the big kerb on the inside.That\'s because it\'s a new kerb that\'s been put in, actually,to raise the level of the kerbback to the level it was before the track got resurfaced twice.But with the resurfacing twice,it had raised the track surface by 80mm,and the drivers found they were, in previous years,able to use that kerb.Now? Not so much.So, there going through is Oliver Wight in the silver car,down towards the hairpin.Jason Lockwood ahead of him.Jason for EXCELR8, and he is running in 12 at the moment,which is potentially going to be his best finish of the year.It\'s been a tough season for Jason,but he could be on for his best results thus far.However, Gus Burton has rather dominated this,and look at the gap that he\'s pulled.Adam Smalley, as we suggested earlier,might be thinking about banking points,but it doesn\'t look as though he\'s been able to do anything at allabout that JTR car ahead.No. In terms of pure speed,he hasn\'t been able to threaten Gus Burton at all, has he? Gus Burton has led every race.As he\'s now passing David Fairbrotherat the back of the field.But he\'s had this race under control.But unfortunately, he\'s got this investigation after the racefor a possible false start hanging over him.And if, if, if anything is found, and it\'s a false start,normally that\'s a ten-second penalty,and he\'s not ten seconds ahead,so there is gonna be a postscript to this story, that\'s for sure.Now, this is Henry Dawes, Ollie Jacksoncoming through the chicane.Dawes goes wide, goes through the gravel,goes over the grass, loses a place,gets it all sideways, but just about saves it by the end of the straight.Yeah, nearly lost it on the wet grass.Oh. Harry Foster.This is passing David Fairbrother again, further back.So, this is Smalley versus Matty Graham for second place.So, this gap has come r. \n\n Your task is to create long detailed paragraph-by-paragraph summary. Detailed paragraph-by-paragraph summary of the text above: \ No newline at end of file diff --git a/vllm/__init__.py b/vllm/__init__.py index 3121d1169027e..5b14ab4d3c413 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -8,7 +8,7 @@ from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import SamplingParams -__version__ = "0.2.4" +__version__ = "0.2.5" __all__ = [ "LLM", diff --git a/vllm/config.py b/vllm/config.py index 6bafa73c7a981..eb1fee0f258b3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -120,14 +120,16 @@ def _verify_load_format(self) -> None: if load_format == "auto": load_format = "pt" - # FIXME(woosuk): This is a temporary hack. Support safetensor weights. + # TODO: Remove this check once HF updates the pt weights of Mixtral. architectures = getattr(self.hf_config, "architectures", []) - if "MixtralForCausalLM" in architectures and load_format != "pt": - logger.info( - "Currently, only 'pt' format is supported for Mixtral. " - "Changing the format to 'pt'. 
This may re-download the " - "weights if you have downloaded the safetensor weights.") - load_format = "pt" + if "MixtralForCausalLM" in architectures: + if load_format == "pt": + raise ValueError( + "Currently, the 'pt' format is not supported for Mixtral. " + "Please use the 'safetensors' format instead. ") + elif load_format == "auto": + # Do not fall back to pt weights. + load_format = "safetensors" self.load_format = load_format diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 4afb96ecb0042..d854a20b8b95a 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -2,7 +2,7 @@ import time from functools import partial from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, - Union) + Union, AsyncIterator) from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs @@ -401,11 +401,12 @@ async def add_request( return stream async def generate( - self, - prompt: Optional[str], - sampling_params: SamplingParams, - request_id: str, - prompt_token_ids: Optional[List[int]] = None) -> RequestOutput: + self, + prompt: Optional[str], + sampling_params: SamplingParams, + request_id: str, + prompt_token_ids: Optional[List[int]] = None + ) -> AsyncIterator[RequestOutput]: """Generate outputs for a request. Generate outputs for a request. This method is a coroutine. It adds the diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index e7bd7548afd29..37543d8c9838e 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -7,54 +7,9 @@ from transformers import PretrainedConfig from vllm.config import ModelConfig -from vllm.model_executor.models import * +from vllm.model_executor.models import ModelRegistry from vllm.model_executor.weight_utils import (get_quant_config, initialize_dummy_weights) -from vllm.utils import is_hip -from vllm.logger import init_logger - -logger = init_logger(__name__) - -# TODO(woosuk): Lazy-load the model classes. 
-_MODEL_REGISTRY = { - "AquilaModel": AquilaForCausalLM, - "AquilaForCausalLM": AquilaForCausalLM, # AquilaChat2 - "BaiChuanForCausalLM": BaiChuanForCausalLM, # baichuan-7b - "BaichuanForCausalLM": BaichuanForCausalLM, # baichuan-13b - "BloomForCausalLM": BloomForCausalLM, - "ChatGLMModel": ChatGLMForCausalLM, - "ChatGLMForConditionalGeneration": ChatGLMForCausalLM, - "FalconForCausalLM": FalconForCausalLM, - "GPT2LMHeadModel": GPT2LMHeadModel, - "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM, - "GPTJForCausalLM": GPTJForCausalLM, - "GPTNeoXForCausalLM": GPTNeoXForCausalLM, - "InternLMForCausalLM": InternLMForCausalLM, - "LlamaForCausalLM": LlamaForCausalLM, - "LLaMAForCausalLM": LlamaForCausalLM, # For decapoda-research/llama-* - "MistralForCausalLM": MistralForCausalLM, - "MixtralForCausalLM": MixtralForCausalLM, - # transformers's mpt class has lower case - "MptForCausalLM": MPTForCausalLM, - "MPTForCausalLM": MPTForCausalLM, - "OPTForCausalLM": OPTForCausalLM, - "PhiForCausalLM": PhiForCausalLM, - "QWenLMHeadModel": QWenLMHeadModel, - "RWForCausalLM": FalconForCausalLM, - "YiForCausalLM": YiForCausalLM, -} - -# Models to be disabled in ROCm -_ROCM_UNSUPPORTED_MODELS = [] -if is_hip(): - for rocm_model in _ROCM_UNSUPPORTED_MODELS: - del _MODEL_REGISTRY[rocm_model] - -# Models partially supported in ROCm -_ROCM_PARTIALLY_SUPPORTED_MODELS = { - "MistralForCausalLM": - "Sliding window attention is not supported in ROCm's flash attention", -} @contextlib.contextmanager @@ -69,19 +24,12 @@ def _set_default_torch_dtype(dtype: torch.dtype): def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: architectures = getattr(config, "architectures", []) for arch in architectures: - if arch in _MODEL_REGISTRY: - if is_hip() and arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: - logger.warning( - f"{arch} is not fully supported in ROCm. Reason: " - f"{_ROCM_PARTIALLY_SUPPORTED_MODELS[arch]}") - return _MODEL_REGISTRY[arch] - elif arch in _ROCM_UNSUPPORTED_MODELS: - raise ValueError( - f"Model architecture {arch} is not supported by ROCm for now. \n" - f"Supported architectures {list(_MODEL_REGISTRY.keys())}") + model_cls = ModelRegistry.load_model_cls(arch) + if model_cls is not None: + return model_cls raise ValueError( f"Model architectures {architectures} are not supported for now. 
" - f"Supported architectures: {list(_MODEL_REGISTRY.keys())}") + f"Supported architectures: {ModelRegistry.get_supported_archs()}") def get_model(model_config: ModelConfig) -> nn.Module: diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 28a0aa772d84d..ab9a1636ad13f 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -1,41 +1,82 @@ -from vllm.model_executor.models.aquila import AquilaForCausalLM -from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM, - BaichuanForCausalLM) -from vllm.model_executor.models.bloom import BloomForCausalLM -from vllm.model_executor.models.falcon import FalconForCausalLM -from vllm.model_executor.models.gpt2 import GPT2LMHeadModel -from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM -from vllm.model_executor.models.gpt_j import GPTJForCausalLM -from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM -from vllm.model_executor.models.internlm import InternLMForCausalLM -from vllm.model_executor.models.llama import LlamaForCausalLM -from vllm.model_executor.models.mistral import MistralForCausalLM -from vllm.model_executor.models.mixtral import MixtralForCausalLM -from vllm.model_executor.models.mpt import MPTForCausalLM -from vllm.model_executor.models.opt import OPTForCausalLM -from vllm.model_executor.models.phi_1_5 import PhiForCausalLM -from vllm.model_executor.models.qwen import QWenLMHeadModel -from vllm.model_executor.models.chatglm import ChatGLMForCausalLM -from vllm.model_executor.models.yi import YiForCausalLM +import importlib +from typing import List, Optional, Type + +import torch.nn as nn + +from vllm.logger import init_logger +from vllm.utils import is_hip + +logger = init_logger(__name__) + +# Architecture -> (module, class). +_MODELS = { + "AquilaModel": ("aquila", "AquilaForCausalLM"), + "AquilaForCausalLM": ("aquila", "AquilaForCausalLM"), # AquilaChat2 + "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b + "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b + "BloomForCausalLM": ("bloom", "BloomForCausalLM"), + "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), + "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), + "FalconForCausalLM": ("falcon", "FalconForCausalLM"), + "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), + "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), + "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), + "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"), + "InternLMForCausalLM": ("internlm", "InternLMForCausalLM"), + "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), + # For decapoda-research/llama-* + "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), + "MistralForCausalLM": ("mistral", "MistralForCausalLM"), + "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), + # transformers's mpt class has lower case + "MptForCausalLM": ("mpt", "MPTForCausalLM"), + "MPTForCausalLM": ("mpt", "MPTForCausalLM"), + "OPTForCausalLM": ("opt", "OPTForCausalLM"), + "PhiForCausalLM": ("phi_1_5", "PhiForCausalLM"), + "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), + "RWForCausalLM": ("falcon", "FalconForCausalLM"), + "YiForCausalLM": ("yi", "YiForCausalLM"), +} + +# Models not supported by ROCm. +_ROCM_UNSUPPORTED_MODELS = [] + +# Models partially supported by ROCm. +# Architecture -> Reason. 
+_ROCM_PARTIALLY_SUPPORTED_MODELS = { + "MistralForCausalLM": + "Sliding window attention is not yet supported in ROCm's flash attention", + "MixtralForCausalLM": + "Sliding window attention is not yet supported in ROCm's flash attention", +} + + +class ModelRegistry: + + @staticmethod + def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]: + if model_arch not in _MODELS: + return None + if is_hip(): + if model_arch in _ROCM_UNSUPPORTED_MODELS: + raise ValueError( + f"Model architecture {model_arch} is not supported by " + "ROCm for now.") + if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: + logger.warning( + f"Model architecture {model_arch} is partially supported " + "by ROCm: " + _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]) + + module_name, model_cls_name = _MODELS[model_arch] + module = importlib.import_module( + f"vllm.model_executor.models.{module_name}") + return getattr(module, model_cls_name, None) + + @staticmethod + def get_supported_archs() -> List[str]: + return list(_MODELS.keys()) + __all__ = [ - "AquilaForCausalLM", - "BaiChuanForCausalLM", - "BaichuanForCausalLM", - "BloomForCausalLM", - "ChatGLMForCausalLM", - "FalconForCausalLM", - "GPT2LMHeadModel", - "GPTBigCodeForCausalLM", - "GPTJForCausalLM", - "GPTNeoXForCausalLM", - "InternLMForCausalLM", - "LlamaForCausalLM", - "MPTForCausalLM", - "OPTForCausalLM", - "PhiForCausalLM", - "QWenLMHeadModel", - "MistralForCausalLM", - "MixtralForCausalLM", - "YiForCausalLM", + "ModelRegistry", ] diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 3021ced88d074..b11e3713fd4da 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -31,21 +31,11 @@ from torch import nn from transformers import MixtralConfig -try: - import megablocks.ops as ops -except ImportError: - print( - "MegaBlocks not found. 
Please install it by `pip install megablocks`.") -try: - import stk -except ImportError: - print( - "STK not found: please see https://github.com/stanford-futuredata/stk") - from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.attention import PagedAttention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, + ReplicatedLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.rotary_embedding import get_rope @@ -65,8 +55,134 @@ KVCache = Tuple[torch.Tensor, torch.Tensor] -def promote_scalar(x: torch.Tensor) -> torch.Tensor: - return x.view(1) if len(x.size()) == 0 else x +class MixtralMLP(nn.Module): + + def __init__( + self, + num_experts: int, + hidden_size: int, + intermediate_size: int, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.num_experts = num_experts + self.ffn_dim = intermediate_size + self.hidden_dim = hidden_size + + self.w1 = ReplicatedLinear(self.hidden_dim, + self.ffn_dim, + bias=False, + linear_method=linear_method) + self.w2 = ReplicatedLinear(self.ffn_dim, + self.hidden_dim, + bias=False, + linear_method=linear_method) + self.w3 = ReplicatedLinear(self.hidden_dim, + self.ffn_dim, + bias=False, + linear_method=linear_method) + + # TODO: Use vllm's SiluAndMul + self.act_fn = nn.SiLU() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + w1_out, _ = self.w1(hidden_states) + w1_out = self.act_fn(w1_out) + w3_out, _ = self.w3(hidden_states) + current_hidden_states = w1_out * w3_out + current_hidden_states, _ = self.w2(current_hidden_states) + return current_hidden_states + + +class DummyModule(nn.Module): + + def __init__(self) -> None: + super().__init__() + + self.w1 = nn.Linear(0, 0, bias=False) + self.w2 = nn.Linear(0, 0, bias=False) + self.w3 = nn.Linear(0, 0, bias=False) + + set_weight_attrs(self.w1.weight, + {"weight_loader": self.dummy_weight_loader}) + set_weight_attrs(self.w2.weight, + {"weight_loader": self.dummy_weight_loader}) + set_weight_attrs(self.w3.weight, + {"weight_loader": self.dummy_weight_loader}) + + def forward(self, *args, **kwargs) -> None: + raise NotImplementedError() + + def dummy_weight_loader(self, *args, **kwargs) -> None: # pylint: disable=unused-argument + # Noop + return + + +class MixtralMoE(nn.Module): + + def __init__( + self, + config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + self.num_total_experts = config.num_local_experts + self.top_k = config.num_experts_per_tok + if self.tp_size > self.num_total_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {self.num_total_experts}.") + # Split experts equally between ranks + self.expert_indicies = np.array_split(range( + self.num_total_experts), self.tp_size)[self.rank].tolist() + if not self.expert_indicies: + raise ValueError( + f"Rank {self.rank} has no experts assigned to it.") + + self.experts = nn.ModuleList([ + MixtralMLP(self.num_total_experts, + config.hidden_size, + config.intermediate_size, + linear_method=linear_method) + if idx in self.expert_indicies else DummyModule() + for idx in range(self.num_total_experts) + ]) + self.gate = ReplicatedLinear(config.hidden_size, + self.num_total_experts, + bias=False, + linear_method=linear_method) + + def 
forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + router_logits, _ = self.gate(hidden_states) + + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, + self.top_k, + dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + final_hidden_states = None + for expert_idx in self.expert_indicies: + expert_layer = self.experts[expert_idx] + expert_mask = (selected_experts == expert_idx) + expert_weights = (routing_weights * expert_mask).sum(dim=-1, + keepdim=True) + + current_hidden_states = expert_layer(hidden_states).mul_( + expert_weights) + if final_hidden_states is None: + final_hidden_states = current_hidden_states + else: + final_hidden_states.add_(current_hidden_states) + + return tensor_model_parallel_all_reduce(final_hidden_states).view( + batch_size, sequence_length, hidden_dim) class MixtralAttention(nn.Module): @@ -77,6 +193,7 @@ def __init__(self, num_kv_heads: int, max_position: int = 4096 * 32, rope_theta: float = 10000, + linear_method: Optional[LinearMethodBase] = None, sliding_window: Optional[int] = None) -> None: super().__init__() self.hidden_size = hidden_size @@ -101,24 +218,26 @@ def __init__(self, self.rope_theta = rope_theta self.sliding_window = sliding_window - self.wqkv = QKVParallelLinear( + self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, self.total_num_heads, self.total_num_kv_heads, bias=False, + linear_method=linear_method, ) - self.wo = RowParallelLinear( + self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, bias=False, + linear_method=linear_method, ) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position, base=int(self.rope_theta), - is_neox_style=False, # weights not in HF format + is_neox_style=True, ) self.attn = PagedAttention( self.num_heads, @@ -136,310 +255,74 @@ def forward( input_metadata: InputMetadata, cache_event: Optional[torch.cuda.Event], ) -> torch.Tensor: - qkv, _ = self.wqkv(hidden_states) + qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) k_cache, v_cache = kv_cache attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata, cache_event) - output, _ = self.wo(attn_output) + output, _ = self.o_proj(attn_output) return output -class BlockSparseMoE(nn.Module): - """ - Built on the paper and library Megablocks as described in - https://arxiv.org/abs/2211.15841. This implementation is - strictly equivalent to standard MoE with full capacity (no - dropped tokens). It's faster since it formulates MoE operations - in terms of block-sparse operations to accomodate imbalanced - assignments of tokens to experts, whereas standard MoE either - (1) drop tokens at the cost of reduced performance or (2) set - capacity factor to number of experts and thus waste computation - and memory on padding. 
- """ - - def __init__(self, hidden_dim: int, ffn_dim: int, num_experts: int, - top_k: int): - super().__init__() - self.hidden_dim = hidden_dim - self.ffn_dim = ffn_dim - self.num_experts = num_experts - self.top_k = top_k - - # gating - self.gate = nn.Linear(self.hidden_dim, - self.num_experts, - bias=False, - device=torch.cuda.current_device()) - - tp_size = get_tensor_model_parallel_world_size() - assert self.ffn_dim % tp_size == 0 - self.ffn_dim_per_partition = self.ffn_dim // tp_size - # merged expert weights, all of size (ffn_dim * n_experts, model_dim) - self.w1 = nn.Parameter( - torch.empty(self.ffn_dim_per_partition * self.num_experts, - self.hidden_dim, - device=torch.cuda.current_device())) - set_weight_attrs(self.w1, {"weight_loader": self.moe_weight_loader}) - self.w2 = nn.Parameter( - torch.empty(self.ffn_dim_per_partition * self.num_experts, - self.hidden_dim, - device=torch.cuda.current_device())) - set_weight_attrs(self.w2, {"weight_loader": self.moe_weight_loader}) - self.w3 = nn.Parameter( - torch.empty(self.ffn_dim_per_partition * self.num_experts, - self.hidden_dim, - device=torch.cuda.current_device())) - set_weight_attrs(self.w3, {"weight_loader": self.moe_weight_loader}) - - # Calculate the number of bits needed to represent the expert indices - # so that we can pass it to radix sort. - self.sort_end_bit = max(int(np.ceil(np.log2(self.num_experts))), 1) - self.blocking = 128 - self.quantize_scatter_num_bits = -1 - - # Calculate the number of bits needed to represent the column indices - # in the intermediate sparse matrix. - max_column_index = (self.ffn_dim * self.num_experts) // self.blocking - self.transpose_sort_end_bit = max( - int(np.ceil(np.log2(max_column_index))), 1) - - def moe_weight_loader(self, param: nn.Parameter, - loaded_weight: torch.Tensor) -> None: - """ - Load the weights for the MoE linear layer. - """ - tp_rank = get_tensor_model_parallel_rank() - shard_size = self.ffn_dim_per_partition - loaded_weight = loaded_weight.view(self.num_experts, self.ffn_dim, -1) - loaded_weight = loaded_weight[:, shard_size * tp_rank:shard_size * - (tp_rank + 1)] - loaded_weight = loaded_weight.reshape_as(param) - param.data.copy_(loaded_weight) - - def sparse_transpose( - self, size: int, row_indices, - column_indices) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - block_columns = size[1] // self.blocking - - # Sort row indices by column indices to get the transposed matrix's - # column indices. - # - # NOTE: Our sort operation uses the same width indices as the input - # values. To avoid overflow when we have large activation matrices - # we cast to 32-bit before sorting. - _, gather_indices = ops.sort(column_indices.int(), - self.transpose_sort_end_bit) - - # There are a constant number of blocks in every row of the sparse - # matrix. A blocks offset is: - # - # row_index * blocks_per_row + column_index % blocks_per_row - # - # Once we have the block offsets ordered for transposition we can - # divide by blocks_per_row to get the transposed column indices. 
- column_indices_t = row_indices.gather(0, gather_indices.long()) - block_offsets_t = gather_indices.int() - - zero = torch.zeros((1, ), dtype=torch.int32, device=row_indices.device) - nnz_per_column = ops.histogram(column_indices, block_columns) - nnz_per_column = ops.inclusive_cumsum(nnz_per_column, 0) - offsets_t = torch.cat([zero, nnz_per_column]) - return column_indices_t, offsets_t, block_offsets_t - - def topology(self, x: torch.Tensor, - padded_bins: torch.Tensor) -> "stk.Matrix": - padded_tokens, _ = x.size() - assert padded_tokens % self.blocking == 0 - assert self.ffn_dim_per_partition % self.blocking == 0 - - # Offsets for the sparse matrix. All rows have the - # same number of nonzero blocks dictated by the - # dimensionality of a single expert. - block_rows = padded_tokens // self.blocking - blocks_per_row = self.ffn_dim_per_partition // self.blocking - offsets = torch.arange( - 0, - block_rows * blocks_per_row + 1, - blocks_per_row, - dtype=torch.int32, - device=x.device, - ) - - # Indices for the sparse matrix. The indices for - # the intermediate matrix are dynamic depending - # on the mapping of tokens to experts. - column_indices = ops.topology(padded_bins, self.blocking, block_rows, - blocks_per_row) - - # TODO(tgale): This is unused. Remove the need for this in stk. - # For now, use meta init to save the device memory. - data = torch.empty( - column_indices.numel(), - self.blocking, - self.blocking, - dtype=x.dtype, - device="meta", - ) - shape = (padded_tokens, self.ffn_dim_per_partition * self.num_experts) - row_indices = stk.ops.row_indices(shape, data, offsets, column_indices) - column_indices_t, offsets_t, block_offsets_t = self.sparse_transpose( - shape, row_indices, column_indices) - return stk.Matrix( - shape, - data, - row_indices, - column_indices, - offsets, - column_indices_t, - offsets_t, - block_offsets_t, - ) - - def indices_and_padded_bins( - self, selected_experts: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, - torch.Tensor]: - # Sort the expert ids to produce the scatter/gather - # indices for the permutation. - selected_experts = selected_experts.int() - bin_ids, indices = ops.sort(selected_experts, self.sort_end_bit) - - # Histogram the expert ids to identify the number of - # tokens routed to each expert. - tokens_per_expert = ops.histogram(selected_experts, self.num_experts) - - # Round the token counts up to the block size used in - # the matrix muliplications. Caculate the starting - # position of each bin. - padded_tokens_per_expert = ops.round_up(tokens_per_expert, - self.blocking) - padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0) - padded_bins = promote_scalar(padded_bins) - - # Calculate the bin bounds for the sorted tokens. 
- bins = ops.inclusive_cumsum(tokens_per_expert, 0) - bins = promote_scalar(bins) - return indices, bin_ids, bins, padded_bins, tokens_per_expert - - @torch.inference_mode() - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - x: (sequence_length, model_dim) - gate_logits: (sequence_length, n_experts) - """ - # optional reshape - input_shape = x.shape - x = x.view(-1, input_shape[-1]) - - # gate_logits: (sequence_length, n_experts) - gate_logits = self.gate(x) - # all_probs: (sequence_length, n_experts) and upcast for softmax - all_probs = F.softmax(gate_logits, dim=1, dtype=torch.float) - # weights, selected_experts: (sequence_length, top-k) - weights, selected_experts = torch.topk(all_probs, self.top_k, dim=-1) - weights /= weights.sum(dim=-1, keepdim=True) - weights = weights.flatten().to(x.dtype) - selected_experts = selected_experts.flatten() - - (indices, bin_ids, bins, padded_bins, - _) = self.indices_and_padded_bins(selected_experts) - - # Permute tokens and pad to prepare expert computation - # (top_k * sequence_length + padding, model_dim) - x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, - self.top_k) - - # Create the sparse matrix topology - with torch.no_grad(): - topo = self.topology(x, padded_bins) - - # Perform the expert computation - # First Dense x Dense -> Sparse for w1 and w3, - # (top_k * sequence_length + padding, ffn_dim * n_experts) - x = stk.Matrix( - topo.size(), - F.silu(stk.ops.sdd(x, self.w1.t(), topo).data) * - stk.ops.sdd(x, self.w3.t(), topo).data, - topo.row_indices, - topo.column_indices, - topo.offsets, - topo.column_indices_t, - topo.offsets_t, - topo.block_offsets_t, - ) - - # Then Sparse x Dense -> Dense for w2 - # (top_k * sequence_length + padding, model_dim) - x = stk.ops.dsd(x, self.w2) - - x = tensor_model_parallel_all_reduce(x) - - # Permute back and remove padding - # (top_k * sequence_length, model_dim) - x = ops.padded_scatter( - x, - indices, - bin_ids, - weights, - bins, - padded_bins, - self.top_k, - self.quantize_scatter_num_bits, - ) - return x.view(*input_shape) - - class MixtralDecoderLayer(nn.Module): def __init__( self, config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 10000) - self.attention = MixtralAttention( + self.self_attn = MixtralAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, - sliding_window=config.sliding_window) - self.block_sparse_moe = BlockSparseMoE( - hidden_dim=self.hidden_size, - ffn_dim=config.intermediate_size, - num_experts=config.num_local_experts, - top_k=config.num_experts_per_tok, - ) - self.attention_norm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.ffn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + sliding_window=config.sliding_window, + linear_method=linear_method) + self.block_sparse_moe = MixtralMoE(config=config, + linear_method=linear_method) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) def forward( self, positions: torch.Tensor, - x: torch.Tensor, + hidden_states: torch.Tensor, kv_cache: KVCache, input_metadata: InputMetadata, cache_event: Optional[torch.cuda.Event], + residual: Optional[torch.Tensor], ) -> torch.Tensor: 
- r = self.attention( + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( positions=positions, - hidden_states=self.attention_norm(x), + hidden_states=hidden_states, kv_cache=kv_cache, input_metadata=input_metadata, cache_event=cache_event, ) - h = x + r - r = self.block_sparse_moe(self.ffn_norm(h)) - out = h + r - return out + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.block_sparse_moe(hidden_states) + return hidden_states, residual -class MixtralForCausalLM(nn.Module): + +class MixtralModel(nn.Module): def __init__( self, @@ -447,23 +330,18 @@ def __init__( linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() - self.config = config - assert linear_method is None self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size - self.tok_embeddings = VocabParallelEmbedding( + + self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, ) - - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.output = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - self.layers = nn.ModuleList([ - MixtralDecoderLayer(config) + MixtralDecoderLayer(config, linear_method=linear_method) for _ in range(config.num_hidden_layers) ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def forward( self, @@ -473,20 +351,42 @@ def forward( input_metadata: InputMetadata, cache_events: Optional[List[torch.cuda.Event]], ) -> SamplerOutput: - hidden_states = self.tok_embeddings(input_ids) - - # forward + hidden_states = self.embed_tokens(input_ids) + residual = None for i in range(len(self.layers)): cache_event = None if cache_events is None else cache_events[i] layer = self.layers[i] - hidden_states = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - cache_event, - ) - hidden_states = self.norm(hidden_states) + hidden_states, residual = layer(positions, hidden_states, + kv_caches[i], input_metadata, + cache_event, residual) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class MixtralForCausalLM(nn.Module): + + def __init__( + self, + config: MixtralConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = MixtralModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + cache_events: Optional[List[torch.cuda.Event]], + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata, cache_events) return hidden_states def sample( @@ -494,7 +394,7 @@ def sample( hidden_states: Optional[torch.Tensor], sampling_metadata: SamplingMetadata, ) -> SamplerOutput: - next_tokens = self.sampler(self.output.weight, hidden_states, + next_tokens = self.sampler(self.lm_head.weight, hidden_states, sampling_metadata) return next_tokens @@ -505,10 +405,11 @@ def load_weights(self, revision: Optional[str] = None): stacked_params_mapping = [ # (param_name, shard_name, shard_id) - ("wqkv", "wq", "q"), - 
("wqkv", "wk", "k"), - ("wqkv", "wv", "v"), + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), ] + params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( model_name_or_path, cache_dir, load_format, revision): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 2209c994e2b83..03b71bbdefa04 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -134,14 +134,14 @@ def _prepare_decode( generation_token = seq_data.get_last_token_id() input_tokens.append([generation_token]) - context_len = seq_data.get_len() - if self.sliding_window is not None: - context_len = min(context_len, self.sliding_window) - context_lens.append(context_len) - - position = context_len - 1 + seq_len = seq_data.get_len() + position = seq_len - 1 input_positions.append([position]) + context_len = seq_len if self.sliding_window is None else min( + seq_len, self.sliding_window) + context_lens.append(context_len) + block_table = seq_group_metadata.block_tables[seq_id] block_number = block_table[position // self.block_size] block_offset = position % self.block_size
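The final model_runner.py hunk decouples the decode position from the attention context length: the position of the token being generated always tracks the full sequence length, while the context length passed to attention is clamped to the sliding window when one is configured. A minimal sketch of that relationship, using illustrative numbers (a hypothetical 6000-token sequence and a 4096-token sliding window, not values from the diff):

from typing import Optional, Tuple


def decode_position_and_context(seq_len: int,
                                sliding_window: Optional[int]) -> Tuple[int, int]:
    # Position of the token being generated: always based on the full sequence.
    position = seq_len - 1
    # Context length seen by attention: clamped to the sliding window if set.
    context_len = seq_len if sliding_window is None else min(seq_len, sliding_window)
    return position, context_len


print(decode_position_and_context(6000, 4096))  # (5999, 4096)
print(decode_position_and_context(6000, None))  # (5999, 6000)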
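The MixtralMoE module introduced earlier in this diff replaces the megablocks block-sparse kernels with a plain per-expert formulation: router logits are softmaxed, the top-k experts per token are selected and their weights renormalized, and each locally owned expert's output is accumulated with its routing weight before a tensor-parallel all-reduce. A single-device sketch of just that routing arithmetic, with toy dimensions and plain nn.Linear layers standing in for MixtralMLP (everything outside the routing math is an assumption, not part of the patch):

import torch
import torch.nn.functional as F
from torch import nn

# Toy sizes; the real values come from MixtralConfig.
hidden_size, num_experts, top_k, num_tokens = 16, 4, 2, 5

gate = nn.Linear(hidden_size, num_experts, bias=False)
experts = nn.ModuleList(
    [nn.Linear(hidden_size, hidden_size, bias=False) for _ in range(num_experts)])

hidden_states = torch.randn(num_tokens, hidden_size)

# Route every token: softmax over experts, keep the top-k, renormalize.
router_logits = gate(hidden_states)
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)

# Accumulate each expert's output scaled by its (masked) routing weight;
# with tensor parallelism, each rank loops only over its own experts and
# all-reduces the partial sums, as in MixtralMoE.forward above.
final_hidden_states = torch.zeros_like(hidden_states)
for expert_idx in range(num_experts):
    expert_mask = (selected_experts == expert_idx)
    expert_weights = (routing_weights * expert_mask).sum(dim=-1, keepdim=True)
    final_hidden_states += experts[expert_idx](hidden_states) * expert_weights.to(
        hidden_states.dtype)

print(final_hidden_states.shape)  # torch.Size([5, 16])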