-
Notifications
You must be signed in to change notification settings - Fork 34
/
maxperf.py
262 lines (216 loc) · 8.56 KB
/
maxperf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
import sys
import PIL
from PyQt5 import QtWidgets, QtCore
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWidgets import QWidget, QSlider, QLabel, QLineEdit, QPushButton
from PyQt5.QtWidgets import QVBoxLayout, QHBoxLayout, QGridLayout
from PyQt5.QtGui import QPixmap, QImage, QColor, QPen, QFont, QPainter
from PyQt5.QtCore import Qt, QTimer, QEvent, pyqtSignal, QCoreApplication
import numpy as np
import torch
from diffusers import AutoPipelineForText2Image
from sfast.compilers.stable_diffusion_pipeline_compiler import (compile, CompilationConfig)
# This script only runs inference, so autograd is switched off globally.
torch.set_grad_enabled(False)
# Permit TF32 in CUDA matmul and cuDNN operations.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# The MainWindow instance; assigned in the __main__ section at the bottom of the file.
mw = None
# Default number of prompts encoded per generation call; the __main__ section
# may replace it with a value from the command line.
batchSize = 10
# Default prompt list; the __main__ section truncates it to batchSize entries.
prompts = ['Evil space kitty', 'Cute dog in hat, H.R. Giger style', 'Horse wearing a tie', 'Cartoon pig', 'Donkey on Mars', 'Cute kitties baked in a cake', 'Boxing chickens on farm, Maxfield Parish style', 'Future spaceship', 'A city of the past', 'Jabba the Hut wearing jewelery']
def dwencode(pipe, prompts, batchSize: int, nTokens: int, debug: bool = False):
    """Encode prompts to text embeddings, appending random tokens to each prompt.

    Every prompt is tokenized and padded to ``tokenizer.model_max_length``.
    For the first ``batchSize`` rows, ``nTokens`` random token ids are written
    directly after the prompt's own tokens and the end mark is moved behind
    them. If a prompt leaves fewer than ``nTokens`` free slots, only as many
    random tokens as fit are appended, so the padded length is never exceeded.

    Args:
        pipe: Pipeline object providing ``tokenizer``, ``text_encoder`` and
            ``unet`` (only ``unet.dtype`` is read).
        prompts: Prompt string(s) handed to the tokenizer.
        batchSize: Number of leading rows that receive random tokens.
        nTokens: Random tokens to append per prompt, 0..75.
        debug: When true, print the decoded tokens of every row.

    Returns:
        The first element of the text encoder's output, converted to
        ``pipe.unet.dtype`` on the CUDA device.

    Raises:
        ValueError: If ``nTokens`` is outside 0..75, or random tokens are
            requested for more rows than there are prompts.
    """
    if nTokens < 0 or nTokens > 75:
        raise ValueError("n random tokens must be between 0 and 75")

    # Token id used as the end mark of each tokenized prompt.
    end_id = 49407
    # Exclusive upper bound for random ids; presumably chosen to stay below
    # the tokenizer's special-token ids -- TODO confirm.
    rand_high = 49405

    tokenizer = pipe.tokenizer
    text_encoder = pipe.text_encoder

    if nTokens > 0:
        randIIs = torch.randint(low=0, high=rand_high, size=(batchSize, nTokens), device='cuda')

    text_inputs = tokenizer(
        prompts,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    ).to('cuda')
    tii = text_inputs.input_ids

    if nTokens > 0:
        if tii.shape[0] < batchSize:
            raise ValueError(
                f"batchSize ({batchSize}) exceeds the number of prompts ({tii.shape[0]})"
            )
        last_slot = tii.shape[1] - 1
        for i in range(batchSize):
            row = tii[i]  # a view: writes below modify tii in place
            # Index of the first end mark, i.e. the slot right after the
            # prompt's own tokens.
            end_pos = (row == end_id).nonzero()[0][0].item()
            # Keep one slot free for the relocated end mark.
            n = min(nTokens, last_slot - end_pos)
            if n <= 0:
                continue
            row[end_pos:end_pos + n] = randIIs[i, :n]
            row[end_pos + n] = end_id

    if debug:
        for bi in range(tii.shape[0]):
            end_pos = (tii[bi] == end_id).nonzero()[0][0].item()
            words = "".join(f"{tokenizer.decode(tid)} " for tid in tii[bi][1:end_pos])
            print(f"{bi:02d}: {words}")

    prompt_embeds = text_encoder(tii, attention_mask=None)[0]
    return prompt_embeds.to(dtype=pipe.unet.dtype, device='cuda')
# Load the SD-Turbo text-to-image pipeline in fp16 with the safety checker
# disabled, then move it to the GPU.
pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sd-turbo",
    torch_dtype=torch.float16,
    variant="fp16",
    safety_checker=None,
    requires_safety_checker=False,
)
pipe.to("cuda")
#pipe.unet.to(memory_format=torch.channels_last)

# Replace the pipeline's VAE with the tiny autoencoder, placed on the GPU.
# NOTE(review): 'torch_device' is passed through unchanged from the original;
# confirm that from_pretrained actually honours it.
from diffusers import AutoencoderTiny
pipe.vae = AutoencoderTiny.from_pretrained(
    'madebyollin/taesd', torch_device='cuda', torch_dtype=torch.float16
).cuda()
pipe.set_progress_bar_config(disable=True)

# Build the stable-fast compilation config.
config = CompilationConfig.Default()

# xformers and Triton are recommended for best performance, but both are
# optional: each is switched on only if its import succeeds.
try:
    import xformers
except ImportError:
    print('xformers not installed, skip')
else:
    config.enable_xformers = True

# Triton can take a while to generate, compile and tune kernels, and may be
# slow on old architectures or with too little VRAM; disable it in that case.
try:
    import triton
except ImportError:
    print('Triton not installed, skip')
else:
    config.enable_triton = True

# CUDA Graph cuts CPU overhead for small batches and small resolutions.
# With insufficient VRAM or high resolutions it can hurt VRAM utilisation and
# slow inference down (notably on Windows/WSL with "shared VRAM"); turn it
# off if that happens.
config.enable_cuda_graph = True

# Remaining compiler switches.
config.enable_jit = True
config.enable_jit_freeze = True
config.trace_scheduler = True
config.enable_cnn_optimization = True
config.preserve_parameters = False
config.prefer_lowp_gemm = True

# Swap the pipeline for its compiled version.
pipe = compile(pipe, config)
class MainWindow(QWidget):
    """Window that shows a grid of generated images plus an fps readout.

    The top row holds the fps field, a banner text field and the Go / Step /
    Stop buttons. Below it sits a grid of ``nImgs`` image labels that are
    filled round-robin by ``genImage``. "Go" generates batches continuously
    through a zero-interval timer, "Step" generates one batch while stopped,
    and "Stop" halts continuous generation.
    """

    def __init__(self):
        """Build the widgets, wire up the signals and render a first batch."""
        super().__init__()
        self.lasttm = time.time()   # reference time for the fps readout
        self.ii = 0                 # index of the next image slot to overwrite
        self.seqno = 0              # number of batches generated so far
        self.stopped = True

        font = QFont("Arial", 24)
        self.fps = QLineEdit(self)
        self.fps.setFixedWidth(176)
        self.fps.setFont(font)
        self.seed = QLineEdit(self)
        self.seed.setText(" Ultra fast RTSD by Daniel Wood aka AIFartist")
        self.seed.setFont(font)
        self.go = QPushButton('Go', self)
        self.step = QPushButton('Step', self)
        self.stop = QPushButton('Stop', self)

        # Create the image areas
        self.nImgs = 10
        self.imgs = []
        for _ in range(self.nImgs):
            label = QtWidgets.QLabel()
            label.setFixedSize(512, 512)
            self.imgs.append(label)

        # Layout the widgets: controls on top, image grid below.
        layout = QVBoxLayout()
        controls = QHBoxLayout()
        for widget in (self.fps, self.seed, self.go, self.step, self.stop):
            controls.addWidget(widget)
        layout.addLayout(controls)

        columns = 5
        grid = QGridLayout()
        for idx, label in enumerate(self.imgs):
            grid.addWidget(label, idx // columns, idx % columns)
        layout.addLayout(grid)

        # A zero-interval timer drives continuous generation while running.
        self.timer = QTimer(self)
        self.timer.timeout.connect(self.do_event)
        self.timer_interval = 0

        self.setLayout(layout)
        self.go.clicked.connect(self.do_go)
        self.step.clicked.connect(self.do_step)
        self.stop.clicked.connect(self.do_stop)

        self.genImage()

    def post_button_click_event(self):
        """Post a mouse-press event to the Go button.

        NOTE(review): this posts a plain QEvent carrying a mouse event type
        rather than a QMouseEvent, and nothing in this file calls it --
        confirm it is safe before relying on it.
        """
        event = QEvent(QEvent.Type(QEvent.MouseButtonPress))
        QCoreApplication.postEvent(self.go, event)

    def do_event(self):
        """Timer callback: generate the next batch unless stopped."""
        if not self.stopped:
            self.do_go()

    def do_stop(self):
        """Stop continuous generation."""
        self.stopped = True
        # Without this the zero-interval timer would keep firing (and
        # calling do_event) for as long as the window stays open.
        self.timer.stop()

    def do_step(self):
        """Generate a single batch; ignored while continuous mode is running."""
        if self.stopped:
            self.genImage()

    def do_go(self):
        """Generate one batch, update the fps readout and keep the timer running."""
        global batchSize
        if self.stopped:
            # Starting from idle: restart the clock so the first reading does
            # not include the time spent stopped.
            self.lasttm = time.time()
        self.stopped = False
        self.genImage()
        tm = time.time()
        elapsed = tm - self.lasttm
        self.fps.setText(f"{(batchSize/elapsed):5.1f} fps")
        print(f"time={(1000.*elapsed):3.1f}ms")
        self.lasttm = tm
        self.timer.start(self.timer_interval)

    def genImage(self):
        """Generate one batch with a fresh random seed and display it.

        Each image is stamped with the current batch sequence number and
        written to the next slot of the grid, wrapping around after the
        last slot.
        """
        global prompts, batchSize
        seed = random.randint(0, 2147483647)
        torch.manual_seed(seed)
        images = genit(0, prompts=prompts, batchSize=batchSize, nSteps=1)
        for img in images:
            # genit requests PIL output; take the size from the image itself
            # instead of assuming 512x512.
            width, height = img.size
            imgData = img.tobytes('raw', 'RGB')
            # RGB888 rows are 3 bytes per pixel; passing the stride explicitly
            # avoids any row-alignment assumption.
            qImg = QImage(imgData, width, height, 3 * width, QImage.Format_RGB888)
            pixmap = QPixmap.fromImage(qImg)

            painter = QPainter(pixmap)
            font = QFont()
            font.setPointSize(32)
            painter.setPen(QColor(255, 255, 0))
            painter.setFont(font)
            painter.drawText(24, 64, f"{self.seqno:4d}")
            painter.end()

            self.imgs[self.ii].setPixmap(pixmap)
            self.ii = (self.ii + 1) % self.nImgs
        self.seqno += 1
import time
import random
import torch
def genit(mode, prompts, batchSize, nSteps, nTokens=9):
    """Generate one batch of 512x512 images from the given prompts.

    The prompts are encoded by ``dwencode`` with ``nTokens`` random tokens
    appended to each, and the embeddings are run through the module-level
    ``pipe``.

    Args:
        mode: Unused; kept so existing callers keep working.
        prompts: Prompt strings to encode.
        batchSize: Number of prompts that receive random tokens.
        nSteps: Number of inference steps.
        nTokens: Random tokens appended to each prompt (default 9, the value
            that was previously hard-coded).

    Returns:
        The first element of the pipeline's tuple output; PIL output is
        requested via ``output_type="pil"``.
    """
    pe = dwencode(pipe, prompts, batchSize, nTokens)
    images = pipe(
        prompt_embeds=pe,
        width=512, height=512,
        num_inference_steps=nSteps,
        guidance_scale=1,
        output_type="pil",
        return_dict=False,
    )[0]
    return images
if __name__ == '__main__':
    # Optional single argument: the batch size (1 .. number of default prompts).
    # With no argument, or with more than one, the module-level defaults for
    # batchSize and prompts are used unchanged.
    if len(sys.argv) == 2:
        maxBatch = len(prompts)
        try:
            batchSize = int(sys.argv[1])
        except ValueError:
            print(f'Batchsize must be an integer, got {sys.argv[1]!r}.')
            sys.exit(1)
        if batchSize > maxBatch:
            # Previously this only printed a message and carried on, which
            # crashed later because there are not enough prompts.
            print(f'Batchsize must not be greater than {maxBatch}.')
            sys.exit(1)
        if batchSize < 1:
            print('Batchsize must be at least 1.')
            sys.exit(1)
        prompts = prompts[:batchSize]

    app = QApplication(sys.argv)
    mw = MainWindow()
    mw.show()
    sys.exit(app.exec_())