# test_llava_interleave.py (forked from LLaVA-VL/LLaVA-NeXT)
import cv2
import gradio as gr
import requests
import torch
from io import BytesIO
from PIL import Image
from transformers import TextStreamer

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init


class InferenceDemo(object):
    def __init__(self, args, model_path, tokenizer, model, image_processor, context_len) -> None:
        disable_torch_init()
        self.tokenizer, self.model, self.image_processor, self.context_len = tokenizer, model, image_processor, context_len

        # Infer the conversation template from the model name instead of relying
        # on the module-level `model_name` global defined under __main__.
        model_name = get_model_name_from_path(model_path)
        if "llama-2" in model_name.lower():
            conv_mode = "llava_llama_2"
        elif "v1" in model_name.lower():
            conv_mode = "llava_v1"
        elif "mpt" in model_name.lower():
            conv_mode = "mpt"
        elif "qwen" in model_name.lower():
            conv_mode = "qwen_1_5"
        else:
            conv_mode = "llava_v0"

        if args.conv_mode is not None and conv_mode != args.conv_mode:
            print("[WARNING] the auto-inferred conversation mode is {}, while `--conv-mode` is {}; using {}".format(conv_mode, args.conv_mode, args.conv_mode))
            conv_mode = args.conv_mode
        else:
            args.conv_mode = conv_mode
        self.conv_mode = conv_mode
        self.conversation = conv_templates[args.conv_mode].copy()
        self.num_frames = args.num_frames


def is_valid_video_filename(name):
    video_extensions = ["avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg"]
    return name.split(".")[-1].lower() in video_extensions


def sample_frames(video_file, num_frames):
    video = cv2.VideoCapture(video_file)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    # Guard against clips shorter than num_frames (interval must be >= 1).
    interval = max(total_frames // num_frames, 1)
    frames = []
    for i in range(total_frames):
        ret, frame = video.read()
        # Check the read succeeded before converting; `frame` is None on failure.
        if not ret:
            continue
        if i % interval == 0:
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    video.release()
    return frames
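
# Example (hypothetical path, for illustration only):
#   frames = sample_frames("demo.mp4", 16)  # -> list of roughly 16 PIL images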


def load_image(image_file):
    if image_file.startswith(("http://", "https://")):
        response = requests.get(image_file)
        # Fail loudly on a bad response instead of falling through with `image` unbound.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
    else:
        print("Loading image from local file:", image_file)
        image = Image.open(image_file).convert("RGB")
    return image
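
# Example (hypothetical URL, for illustration only):
#   image = load_image("https://example.com/sample.jpg")  # -> RGB PIL.Image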


def clear_history(history):
    # Reset the global chatbot's conversation to a fresh copy of its template.
    our_chatbot.conversation = conv_templates[our_chatbot.conv_mode].copy()
    return None


def clear_response(history):
    for index_conv in range(1, len(history)):
        # Walk backwards until we reach the last turn that has a text input.
        conv = history[-index_conv]
        if conv[0] is not None:
            break
    question = history[-index_conv][0]
    history = history[:-index_conv]
    return history, question


def print_like_dislike(x: gr.LikeData):
    print(x.index, x.value, x.liked)


def add_message(history, message):
    global our_chatbot
    if len(history) == 0:
        our_chatbot = InferenceDemo(args, model_path, tokenizer, model, image_processor, context_len)
    for x in message["files"]:
        history.append(((x,), None))
    if message["text"] is not None:
        history.append((message["text"], None))
    return history, gr.MultimodalTextbox(value=None, interactive=False)


## Format of `history`
## Example: 4 images, 1 text
## history = [
#     [(image_path1,), None], [(image_path2,), None], [(image_path2,), None], [(image_path3,), None], ['text string', None]
# ]
## More specifically, each entry of history is a pair of (input (image or text), output (text str)). For the last example, if the model
## outputs text like "xxxxxxx" and I continue typing, history becomes:
## history = [
#     [(image_path1,), None], [(image_path2,), None], [(image_path2,), None], [(image_path3,), None], ['text string', 'xxxxxxx'], ['new text string', None]
# ]
# So one can keep going again and again;
# note that this may take a lot of GPU memory and may raise OOM errors.
# A real example of history:
# history = [[('/tmp/gradio/58ac8930cf5058ce6182b7e987d0600474de3e47/iphone-15-price-1024x576.jpg',), None],
#     [('/tmp/gradio/039a977e49baaa24222d5d6533ffc4d555a1b5b6/dynamic-island-1024x576.jpg',), None],
#     [('/tmp/gradio/c0685d0f5df1b9ddbd6f6e0d1b7ba2f4cd58b5e0/iphone-15-colors-1024x576.jpg',), None],
#     [('/tmp/gradio/6d30d311aea2b930e68472d17cd5c4c992cfb5da/Iphone-15-Usb-c-charger-1024x576.jpg',), None],
#     [('/tmp/gradio/c32138aed8de4af3a03c4f1e04bfd7baea8fc32b/A-17-processors-1024x576.jpg',), None],
#     ['The images are the PPT of iPhone 15 review. can you summarize the main information?', 'The images are from a presentation slide about the iPhone 15, which is an innovative device with a glass back and aluminum enclosure. It has a dynamic island feature that allows for quick access to flight information and delivery tracking. The phone comes in five amazing colors: Queen Pink, Pearl, Pastel Gray, Light Gray, and Onyx. It also has a USB-C compatible charger that can charge multiple devices at once. The iPhone 15 features powerful processors, including Apple A16 Bionic and Apple A17 Pro.'],
#     ['again?', 'I apologize for the confusion. The images are from a presentation slide about the iPhone 15, which is an innovative device with a glass back and aluminum enclosure. It has a dynamic island feature that allows for quick access to flight information and delivery tracking. The phone comes in five amazing colors: Queen Pink, Pearl, Pastel Gray, Light Gray, and Onyx. It also has a USB-C compatible charger that can charge multiple devices at once. The iPhone 15 features powerful processors, including Apple A16 Bionic and Apple A17 Pro.'],
#     ['more?', None]]
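
# These handlers (add_message / bot / print_like_dislike / clear_history) appear to
# come from a Gradio demo. A minimal sketch of how they could be wired together,
# assuming standard Gradio 4.x components (gr.Blocks, gr.Chatbot,
# gr.MultimodalTextbox); illustrative only, not executed by this test:
#
#   with gr.Blocks() as demo:
#       chatbot = gr.Chatbot()
#       chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image", "video"])
#       chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
#       bot_msg = chat_msg.then(bot, chatbot, chatbot)
#       bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
#       chatbot.like(print_like_dislike, None, None)
#   demo.launch(server_name=args.server_name, server_port=args.port)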


def bot(history):
    text = history[-1][0]
    images_this_term = []
    num_new_images = 0
    for message in history[:-1]:
        if isinstance(message[0], tuple):
            images_this_term.append(message[0][0])
            if is_valid_video_filename(message[0][0]):
                num_new_images += our_chatbot.num_frames
            else:
                num_new_images += 1
        else:
            # A text turn resets the count: earlier images already have their
            # tokens recorded in the running conversation.
            num_new_images = 0
    assert len(images_this_term) > 0, "must have an image"

    image_list = []
    for f in images_this_term:
        if is_valid_video_filename(f):
            image_list += sample_frames(f, our_chatbot.num_frames)
        else:
            image_list.append(load_image(f))
    image_tensor = [our_chatbot.image_processor.preprocess(f, return_tensors="pt")["pixel_values"][0].half().to(our_chatbot.model.device) for f in image_list]
    image_tensor = torch.stack(image_tensor)

    image_token = DEFAULT_IMAGE_TOKEN * num_new_images
    # if our_chatbot.model.config.mm_use_im_start_end:
    #     inp = DEFAULT_IM_START_TOKEN + image_token + DEFAULT_IM_END_TOKEN + "\n" + text
    # else:
    inp = image_token + "\n" + text

    our_chatbot.conversation.append_message(our_chatbot.conversation.roles[0], inp)
    our_chatbot.conversation.append_message(our_chatbot.conversation.roles[1], None)
    prompt = our_chatbot.conversation.get_prompt()

    input_ids = tokenizer_image_token(prompt, our_chatbot.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(our_chatbot.model.device)
    stop_str = our_chatbot.conversation.sep if our_chatbot.conversation.sep_style != SeparatorStyle.TWO else our_chatbot.conversation.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, our_chatbot.tokenizer, input_ids)
    streamer = TextStreamer(our_chatbot.tokenizer, skip_prompt=True, skip_special_tokens=True)

    with torch.inference_mode():
        output_ids = our_chatbot.model.generate(input_ids, images=image_tensor, do_sample=True, temperature=0.2, max_new_tokens=1024, streamer=streamer, use_cache=False, stopping_criteria=[stopping_criteria])

    outputs = our_chatbot.tokenizer.decode(output_ids[0]).strip()
    if outputs.endswith(stop_str):
        outputs = outputs[: -len(stop_str)]
    our_chatbot.conversation.messages[-1][-1] = outputs

    history[-1] = [text, outputs]
    return history


if __name__ == "__main__":
    import argparse

    argparser = argparse.ArgumentParser()
    argparser.add_argument("--server_name", default="0.0.0.0", type=str)
    argparser.add_argument("--port", default=6123, type=int)
    argparser.add_argument("--model_path", default="lmms-lab/llava-next-interleave-qwen-7b", type=str)
    argparser.add_argument("--model-base", type=str, default=None)
    argparser.add_argument("--num-gpus", type=int, default=1)
    argparser.add_argument("--conv-mode", type=str, default=None)
    argparser.add_argument("--temperature", type=float, default=0.2)
    argparser.add_argument("--max-new-tokens", type=int, default=512)
    argparser.add_argument("--num_frames", type=int, default=16)
    argparser.add_argument("--load-8bit", type=int, default=1)  # 8-bit loading is on by default
    argparser.add_argument("--load-4bit", action="store_true")
    argparser.add_argument("--debug", action="store_true")
    args = argparser.parse_args()

    model_path = args.model_path
    filt_invalid = "cut"
    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit)
    our_chatbot = InferenceDemo(args, model_path, tokenizer, model, image_processor, context_len)

    history = [
        [('/tmp/gradio/58ac8930cf5058ce6182b7e987d0600474de3e47/iphone-15-price-1024x576.jpg',), None],
        [('/tmp/gradio/039a977e49baaa24222d5d6533ffc4d555a1b5b6/dynamic-island-1024x576.jpg',), None],
        [('/tmp/gradio/c0685d0f5df1b9ddbd6f6e0d1b7ba2f4cd58b5e0/iphone-15-colors-1024x576.jpg',), None],
        [('/tmp/gradio/6d30d311aea2b930e68472d17cd5c4c992cfb5da/Iphone-15-Usb-c-charger-1024x576.jpg',), None],
        [('/tmp/gradio/c32138aed8de4af3a03c4f1e04bfd7baea8fc32b/A-17-processors-1024x576.jpg',), None],
        ['The images are the PPT of iPhone 15 review. can you summarize the main information?', 'The images are from a presentation slide about the iPhone 15, which is an innovative device with a glass back and aluminum enclosure. It has a dynamic island feature that allows for quick access to flight information and delivery tracking. The phone comes in five amazing colors: Queen Pink, Pearl, Pastel Gray, Light Gray, and Onyx. It also has a USB-C compatible charger that can charge multiple devices at once. The iPhone 15 features powerful processors, including Apple A16 Bionic and Apple A17 Pro.'],
        ['again?', 'I apologize for the confusion. The images are from a presentation slide about the iPhone 15, which is an innovative device with a glass back and aluminum enclosure. It has a dynamic island feature that allows for quick access to flight information and delivery tracking. The phone comes in five amazing colors: Queen Pink, Pearl, Pastel Gray, Light Gray, and Onyx. It also has a USB-C compatible charger that can charge multiple devices at once. The iPhone 15 features powerful processors, including Apple A16 Bionic and Apple A17 Pro.'],
        ['I want a more detailed description', None],
    ]
    # print('old history', history)
    history = bot(history)
    # print('new history', history)
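
# Example invocation (a sketch; the checkpoint is downloaded from the Hugging Face
# hub on first use, and the /tmp/gradio/... image paths above must exist locally):
#   python test_llava_interleave.py --model_path lmms-lab/llava-next-interleave-qwen-7b --num_frames 16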