-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from WukLab/zijian-dev
Zijian's work on resuming a request after its API call completes: the tokens returned by the API are processed with multiple single_query_cached_kv_attention calls. A few places are currently written for testing only and need to be changed to run real models later.
- Loading branch information
Showing
19 changed files
with
567 additions
and
101 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -173,3 +173,6 @@ cython_debug/ | |
|
||
# Sphinx documentation | ||
_build/ | ||
|
||
ShareGPT_V3_unfiltered_cleaned_split.json | ||
*.nsys-rep |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import pstats | ||
from pstats import SortKey | ||
|
||
# Load the profiler dump written to 'prof_time.log', drop directory prefixes
# from file names, and order entries by cumulative time.  strip_dirs() and
# sort_stats() both modify `p` in place, so this setup is done once and every
# report below reuses it.
p = pstats.Stats('prof_time.log')
p.strip_dirs()
p.sort_stats('cumtime')

# Report 1: the 30 entries with the highest cumulative time.
p.print_stats(30)

# Reports 2 and 3: only entries whose name matches the given pattern.
p.print_stats('llama.py')
p.print_stats('sampler.py')

# Other views that are handy while profiling (uncomment as needed):
#   p.print_stats('layernorm.py')
#   p.print_stats('attention.py')
#   p.print_callees('step')
#   p.print_callees('forward')
#   p.print_callees('_prune_hidden_states')
#   p.print_callees('_sample_from_generation_tokens')
#   p.print_callees('_sample_optimized')
#   p.sort_stats('tottime').print_stats(30)
#   p.sort_stats('tottime').print_stats('opt.py:', 30)
#   p.sort_stats('tottime').print_stats('sampler.py')
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import argparse | ||
|
||
from vllm import EngineArgs, LLMEngine, SamplingParams, utils | ||
|
||
def api_call(input: str) -> str:
    """Stand-in for an external API call used by this demo.

    The argument is accepted but ignored; the same fixed placeholder string
    is returned on every call.
    """
    placeholder_response = " a "
    return placeholder_response
|
||
def main(args: argparse.Namespace):
    """Drive an ``LLMEngine`` by hand to exercise pausing and resuming.

    The engine is built from the parsed CLI arguments, one test prompt is
    queued with the API stop string as its stop condition, and a single
    ``engine.step()`` is executed.  Any output reported as paused is resumed
    through ``engine.new_resume_request``; afterwards the engine is stepped
    until no unfinished requests remain, printing each finished output.

    Args:
        args: Parsed command-line namespace understood by
            ``EngineArgs.from_cli_args``.
    """
    # Build the engine from the CLI arguments.
    engine = LLMEngine.from_engine_args(EngineArgs.from_cli_args(args))

    # Sampling stops at the API stop string.
    api_stop = [utils.get_api_stop_string()]
    test_prompts = [
        ("The president of the United States is",
         SamplingParams(temperature=0.0, presence_penalty=0.2, stop=api_stop)),
    ]

    # Queue every prompt; its position in the list doubles as the request id.
    for request_id, (prompt, sampling_params) in enumerate(test_prompts):
        engine.add_request(str(request_id), prompt, sampling_params)

    # Run one step manually and resume whatever paused during it.
    for first_output in engine.step():
        print(first_output)
        if not first_output.paused:
            continue

        resume_payload = {}
        # Each paused entry unpacks into two values: the first indexes
        # `outputs`, the second keys the resume payload (presumably an output
        # index and a sequence id -- confirm against the engine).
        for rid, sid in first_output.paused:
            # The stub API is still invoked on the generated text, but its
            # result is not used yet: a fixed placeholder list is sent back.
            api_call(first_output.outputs[rid].text)
            resume_payload[sid] = [10]
        engine.new_resume_request(first_output.request_id, resume_payload)
        # Debug aid: dump the data of the first sequence of the first
        # running group.
        print(engine.scheduler.running[0].seqs[0].data)

    # Keep stepping until nothing is left unfinished, reporting each request
    # as it finishes.
    while True:
        for step_output in engine.step():
            if step_output.finished:
                print(step_output)
        if not engine.has_unfinished_requests():
            break
|
||
if __name__ == '__main__':
    # Create the base parser, let EngineArgs register its own options on it,
    # then hand the parsed namespace to main().
    cli_parser = EngineArgs.add_cli_args(
        argparse.ArgumentParser(
            description='Demo on using the LLMEngine class directly'))
    main(cli_parser.parse_args())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
If the ray package raises this error: | ||
AttributeError: 'NoneType' object has no attribute 'fs' | ||
install the pinned ray version: | ||
pip install ray==2.5.1 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.