DEBUG = False
STORY_DIR = "stories/my_story"
# EXECUTE_EXAMPLES = True
EXECUTE_EXAMPLES = False
TEXT_MODEL_BACKEND = "ollama"
# TEXT_MODEL_BACKEND = "nim"
# TEXT_MODEL = "microsoft/Phi-3.5-mini-instruct" # ~7 GB VRAM
# TEXT_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct" # ~15.6 GB VRAM
# TEXT_MODEL = "meta-llama/Llama-3.1-8B-Instruct" # ~15.6 GB VRAM
# TEXT_MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# TEXT_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
# TEXT_MODEL = "llama3.1"
# TEXT_MODEL = "llama3.1:70b"
TEXT_MODEL = "nemotron:70b"
TEXT_CONTEXT_WINDOW = 100000 # 128k for llama3.1
# NIM
# TEXT_MODEL = "nvidia/llama-3.1-nemotron-70b-instruct"
# DEVICE_MAP = "cuda"
# DEVICE_MAP = "cpu"
DEVICE_MAP = "auto"
# TODO: Might want to do this for different models
import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
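# A minimal sketch (an assumption, not this project's actual loader) of how DEVICE_MAP and
# DEVICE could feed a Hugging Face transformers load; the model id is one of the
# Transformers-style options listed above and the prompt is a placeholder:
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
#   model = AutoModelForCausalLM.from_pretrained(
#       "meta-llama/Llama-3.1-8B-Instruct", device_map=DEVICE_MAP)
#   inputs = tokenizer("Once upon a time", return_tensors="pt").to(DEVICE)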
# A temperature of 0 gives deterministic output; higher values make the output more random
TEMPERATURE = 0.0  # 0.6 is a good balance between randomness and coherence
# Top-p sampling to control output diversity (only used if temperature > 0)
# TOP_P = 0.9
TOP_P = 1
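# A minimal sketch (an assumption, not this project's client code) of how these sampling
# settings might be passed to the Ollama backend via the ollama Python package:
#
#   import ollama
#   reply = ollama.chat(
#       model=TEXT_MODEL,
#       messages=[{"role": "user", "content": "Write the next paragraph of the story."}],
#       options={"temperature": TEMPERATURE, "top_p": TOP_P, "num_ctx": TEXT_CONTEXT_WINDOW},
#   )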
# Model that creates images from text
# Timestep-distilled - faster but has some limitations
# IMAGE_GENERATOR_MODEL = "black-forest-labs/FLUX.1-schnell" # About 33 GB VRAM?
# Guidance-distilled - focuses on quality
IMAGE_GENERATOR_MODEL = "black-forest-labs/FLUX.1-dev" # About 33 GB VRAM?
# How many images to generate at once; depends on GPU memory and the size of the generated images
IMAGE_GENERATION_BATCH_SIZE = 1
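# A minimal sketch (an assumption, not this project's image code) of how the image model
# and batch size could be used with the diffusers FluxPipeline; the prompt is a placeholder:
#
#   import torch
#   from diffusers import FluxPipeline
#   pipe = FluxPipeline.from_pretrained(IMAGE_GENERATOR_MODEL, torch_dtype=torch.bfloat16)
#   images = pipe("a castle at dusk",
#                 num_images_per_prompt=IMAGE_GENERATION_BATCH_SIZE).images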
# Model that creates text from images to review whether the image was generated correctly
VISION_MODEL = "microsoft/Phi-3.5-vision-instruct" # Transformers
# VISION_MODEL = "meta-llama/Llama-3.2-11B-Vision" # Transformers
# VISION_MODEL = "meta-llama/Llama-3.2-11B-Vision-Instruct" # Transformers
# VISION_MODEL = "llama3.2-vision" # Ollama
# VISION_MODEL = "llama3.2-vision:90b" # Ollama
VISION_DEVICE_MAP = "auto"
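# A minimal sketch (an assumption, not this project's review step) of loading the default
# vision model; Phi-3.5-vision needs trust_remote_code=True:
#
#   from transformers import AutoModelForCausalLM, AutoProcessor
#   vision_processor = AutoProcessor.from_pretrained(VISION_MODEL, trust_remote_code=True)
#   vision_model = AutoModelForCausalLM.from_pretrained(
#       VISION_MODEL, device_map=VISION_DEVICE_MAP, trust_remote_code=True)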
MUSIC_MODEL = "facebook/musicgen-large"
# MUSIC_MODEL = "facebook/musicgen-medium"
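# A minimal sketch (an assumption, not this project's music code) of loading MusicGen with
# transformers; the text prompt is a placeholder:
#
#   from transformers import AutoProcessor, MusicgenForConditionalGeneration
#   music_processor = AutoProcessor.from_pretrained(MUSIC_MODEL)
#   music_model = MusicgenForConditionalGeneration.from_pretrained(MUSIC_MODEL)
#   music_inputs = music_processor(text=["calm piano theme"], return_tensors="pt")
#   audio = music_model.generate(**music_inputs, max_new_tokens=256)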
# Model that creates short videos from images
VIDEO_GENERATOR_MODEL = "rain1011/pyramid-flow-sd3"
# ATTN_IMPLEMENTATION = None
ATTN_IMPLEMENTATION = "flash_attention_2"
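# A minimal sketch (an assumption) of forwarding ATTN_IMPLEMENTATION to a transformers
# from_pretrained call when flash-attn is installed:
#
#   model = AutoModelForCausalLM.from_pretrained(
#       VISION_MODEL, trust_remote_code=True, attn_implementation=ATTN_IMPLEMENTATION)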
# This might not be necessary
TOKENIZERS_PARALLELISM = "true"
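# A minimal sketch (an assumption) of exporting this before any tokenizers are created:
#
#   import os
#   os.environ["TOKENIZERS_PARALLELISM"] = TOKENIZERS_PARALLELISM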
IMAGE_TO_VIDEO_MODEL = "THUDM/CogVideoX-5b-I2V"
# IMAGE_TO_VIDEO_MODEL = "ali-vilab/i2vgen-xl"
IMAGE_TO_VIDEO_QUANTIZED = False
# Dynamic CFG can improve video quality and prompt adherence in CogVideoX generations,
# but it may also slow inference and increase memory usage.
IMAGE_TO_VIDEO_DYNAMIC_CFG = True
TEXT_TO_VIDEO_MODEL = "THUDM/CogVideoX-5b"
TEXT_TO_VIDEO_QUANTIZED = False
if "i2vgen-xl" in IMAGE_TO_VIDEO_MODEL:
# Size for i2vgen-xl
CHARACTER_ANIMATION_WIDTH = 1280
CHARACTER_ANIMATION_HEIGHT = 704
else:
# Size for CogVideoX-5b-I2V
CHARACTER_ANIMATION_WIDTH = 720
CHARACTER_ANIMATION_HEIGHT = 480
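# A minimal sketch (an assumption, not this project's animation code) of how the
# image-to-video settings above could drive the diffusers CogVideoX image-to-video pipeline;
# the input image path and prompt are placeholders:
#
#   import torch
#   from diffusers import CogVideoXImageToVideoPipeline
#   from diffusers.utils import load_image
#   i2v_pipe = CogVideoXImageToVideoPipeline.from_pretrained(
#       IMAGE_TO_VIDEO_MODEL, torch_dtype=torch.bfloat16)
#   frames = i2v_pipe(
#       image=load_image("character.png"),
#       prompt="the character waves",
#       height=CHARACTER_ANIMATION_HEIGHT,
#       width=CHARACTER_ANIMATION_WIDTH,
#       use_dynamic_cfg=IMAGE_TO_VIDEO_DYNAMIC_CFG,
#   ).frames[0]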
# TTS_MODEL = "parler-tts/parler-tts-large-v1"
TTS_MODEL = "parler-tts/parler-tts-mini-v1"
TTS_REFERENCE_SPEECH = ("The birch canoe slid on the smooth planks. Glue the sheet to the dark blue background. "
"These days a chicken leg is a rare dish. Rice is often served in round bowls. "
"Help the woman get back to her feet.")
VOICE_CLONE_MODEL = "fish"
# CoquiTTS
# https://huggingface.co/docs/accelerate/en/usage_guides/model_size_estimator for estimating model size
# !export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True