test_llava_interleave_mvcaption_scene.py (forked from LLaVA-VL/LLaVA-NeXT)
# from .demo_modelpart import InferenceDemo
# import gradio as gr
import os
# import time
import cv2
import glob
import random
import time
# import copy
import torch
# import random
import numpy as np
from llava import conversation as conversation_lib
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
from PIL import Image
import requests
from io import BytesIO
from transformers import TextStreamer
class InferenceDemo(object):
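    """Wraps the loaded LLaVA checkpoint together with the conversation template used for captioning."""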
    def __init__(self, args, model_path, tokenizer, model, image_processor, context_len) -> None:
        disable_torch_init()
        self.tokenizer, self.model, self.image_processor, self.context_len = tokenizer, model, image_processor, context_len
        # Infer the conversation template from the checkpoint name.
        model_name = get_model_name_from_path(model_path)
if "llama-2" in model_name.lower():
conv_mode = "llava_llama_2"
elif "v1" in model_name.lower():
conv_mode = "llava_v1"
elif "mpt" in model_name.lower():
conv_mode = "mpt"
elif 'qwen' in model_name.lower():
conv_mode = "qwen_1_5"
else:
conv_mode = "llava_v0"
if args.conv_mode is not None and conv_mode != args.conv_mode:
print("[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(conv_mode, args.conv_mode, args.conv_mode))
else:
args.conv_mode = conv_mode
        # keep the mode actually used so that clear_history() resets to the same template
        self.conv_mode = args.conv_mode
self.conversation = conv_templates[args.conv_mode].copy()
self.num_frames = args.num_frames
def is_valid_video_filename(name):
    video_extensions = ['avi', 'mp4', 'mov', 'mkv', 'flv', 'wmv', 'mjpeg']
    ext = name.split('.')[-1].lower()
    return ext in video_extensions
def sample_frames(video_file, num_frames):
    video = cv2.VideoCapture(video_file)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(total_frames // num_frames, 1)
    frames = []
    for i in range(total_frames):
        ret, frame = video.read()
        if not ret:
            continue
        if i % interval == 0:
            # OpenCV returns BGR; convert to RGB before wrapping in a PIL image
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    video.release()
    return frames
def load_image(image_file):
    if image_file.startswith("http://") or image_file.startswith("https://"):
        response = requests.get(image_file)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
    else:
        print('Load image from local file:', image_file)
        # read with the alpha channel preserved so transparent backgrounds can be filled with white
        image = cv2.imread(image_file, cv2.IMREAD_UNCHANGED)
        if image.ndim == 2:  # grayscale -> 3 channels
            image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
        if image.shape[2] == 4:
            mask = image[..., 3:4] / 255.
            image = image[..., :3] * mask + 255. * (1 - mask)  # composite onto a white background
        image = image[..., ::-1]  # BGR -> RGB
        # image = cv2.resize(image, (64, 64), cv2.INTER_CUBIC)
        image = Image.fromarray(image.astype(np.uint8))
    return image
def clear_history(history):
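    # Reset the global chatbot's conversation so the next query starts from a fresh template.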
our_chatbot.conversation = conv_templates[our_chatbot.conv_mode].copy()
return None
def clear_response(history):
for index_conv in range(1, len(history)):
# loop until get a text response from our model.
conv = history[-index_conv]
if not (conv[0] is None):
break
question = history[-index_conv][0]
history = history[:-index_conv]
return history, question
# def print_like_dislike(x: gr.LikeData):
# print(x.index, x.value, x.liked)
# Leftover from the Gradio demo; calling it requires `import gradio as gr` (commented out above). Unused in this script.
def add_message(history, message):
# history=[]
global our_chatbot
if len(history)==0:
our_chatbot = InferenceDemo(args,model_path,tokenizer, model, image_processor, context_len)
for x in message["files"]:
history.append(((x,), None))
if message["text"] is not None:
history.append((message["text"], None))
return history, gr.MultimodalTextbox(value=None, interactive=False)
## Format of `history`
## Example: 4 images, 1 text prompt:
## history = [
##     [(image_path1,), None], [(image_path2,), None], [(image_path3,), None], [(image_path4,), None], ['text string', None]
## ]
## More specifically, each entry of history is a pair of (input (image or text), output (text str)). For the last example, if the model
## outputs a text like "xxxxxxx" and the user keeps typing, history becomes:
## history = [
##     [(image_path1,), None], [(image_path2,), None], [(image_path3,), None], [(image_path4,), None], ['text string', 'xxxxxxx'], ['new text string', None]
## ]
## This can be repeated again and again; note that it may take a lot of GPU memory and may raise OOM errors.
## A real example of history is:
# history [[('/tmp/gradio/58ac8930cf5058ce6182b7e987d0600474de3e47/iphone-15-price-1024x576.jpg',), None],
# [('/tmp/gradio/039a977e49baaa24222d5d6533ffc4d555a1b5b6/dynamic-island-1024x576.jpg',), None],
# [('/tmp/gradio/c0685d0f5df1b9ddbd6f6e0d1b7ba2f4cd58b5e0/iphone-15-colors-1024x576.jpg',), None],
# [('/tmp/gradio/6d30d311aea2b930e68472d17cd5c4c992cfb5da/Iphone-15-Usb-c-charger-1024x576.jpg',), None],
# [('/tmp/gradio/c32138aed8de4af3a03c4f1e04bfd7baea8fc32b/A-17-processors-1024x576.jpg',), None],
# ['The images are the PPT of iPhone 15 review. can you summarize the main information?', 'The images are from a presentation slide about the iPhone 15, which is an innovative device with a glass back and aluminum enclosure. It has a dynamic island feature that allows for quick access to flight information and delivery tracking. The phone comes in five amazing colors: Queen Pink, Pearl, Pastel Gray, Light Gray, and Onyx. It also has a USB-C compatible charger that can charge multiple devices at once. The iPhone 15 features powerful processors, including Apple A16 Bionic and Apple A17 Pro.'],
# ['again?', 'I apologize for the confusion. The images are from a presentation slide about the iPhone 15, which is an innovative device with a glass back and aluminum enclosure. It has a dynamic island feature that allows for quick access to flight information and delivery tracking. The phone comes in five amazing colors: Queen Pink, Pearl, Pastel Gray, Light Gray, and Onyx. It also has a USB-C compatible charger that can charge multiple devices at once. The iPhone 15 features powerful processors, including Apple A16 Bionic and Apple A17 Pro.'],
# ['more?', None]]
def bot(history):
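    """Run one round of generation.

    `history` holds image/video entries followed by one text prompt (see the format notes above).
    Returns the updated history and the decoded model output.
    """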
    text = history[-1][0]
    images_this_term = []
    text_this_term = ''
# import pdb;pdb.set_trace()
num_new_images = 0
for i,message in enumerate(history[:-1]):
if type(message[0]) is tuple:
images_this_term.append(message[0][0])
if is_valid_video_filename(message[0][0]):
num_new_images+=our_chatbot.num_frames
else:
num_new_images+=1
else:
num_new_images=0
# for message in history[-i-1:]:
# images_this_term.append(message[0][0])
assert len(images_this_term)>0, "must have an image"
# image_files = (args.image_file).split(',')
# image = [load_image(f) for f in images_this_term if f]
image_list=[]
for f in images_this_term:
if is_valid_video_filename(f):
image_list+=sample_frames(f, our_chatbot.num_frames)
else:
image_list.append(load_image(f))
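    # Preprocess every image/frame with the vision processor and move it to the model device in fp16.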
image_tensor = [our_chatbot.image_processor.preprocess(f, return_tensors="pt")["pixel_values"][0].half().to(our_chatbot.model.device) for f in image_list]
image_tensor = torch.stack(image_tensor)
image_token = DEFAULT_IMAGE_TOKEN*num_new_images
# if our_chatbot.model.config.mm_use_im_start_end:
# inp = DEFAULT_IM_START_TOKEN + image_token + DEFAULT_IM_END_TOKEN + "\n" + inp
# else:
    inp = text
    inp = image_token + "\n" + inp
our_chatbot.conversation.append_message(our_chatbot.conversation.roles[0], inp)
# image = None
our_chatbot.conversation.append_message(our_chatbot.conversation.roles[1], None)
prompt = our_chatbot.conversation.get_prompt()
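    # tokenizer_image_token replaces each <image> placeholder in the prompt with IMAGE_TOKEN_INDEX.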
input_ids = tokenizer_image_token(prompt, our_chatbot.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(our_chatbot.model.device)
stop_str = our_chatbot.conversation.sep if our_chatbot.conversation.sep_style != SeparatorStyle.TWO else our_chatbot.conversation.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, our_chatbot.tokenizer, input_ids)
streamer = TextStreamer(our_chatbot.tokenizer, skip_prompt=True, skip_special_tokens=True)
# import pdb;pdb.set_trace()
with torch.inference_mode():
output_ids = our_chatbot.model.generate(input_ids, images=image_tensor, do_sample=True, temperature=0.2, max_new_tokens=1024, streamer=streamer, use_cache=False, stopping_criteria=[stopping_criteria])
outputs = our_chatbot.tokenizer.decode(output_ids[0]).strip()
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
our_chatbot.conversation.messages[-1][-1] = outputs
history[-1]=[text,outputs]
return history, outputs
if __name__ == "__main__":
import argparse
argparser = argparse.ArgumentParser()
argparser.add_argument("--server_name", default="0.0.0.0", type=str)
argparser.add_argument("--port", default="6123", type=str)
argparser.add_argument("--model_path", default="lmms-lab/llava-next-interleave-qwen-7b", type=str)
argparser.add_argument("--model-base", type=str, default=None)
argparser.add_argument("--num-gpus", type=int, default=1)
argparser.add_argument("--conv-mode", type=str, default=None)
argparser.add_argument("--temperature", type=float, default=0.2)
argparser.add_argument("--max-new-tokens", type=int, default=512)
argparser.add_argument("--num_frames", type=int, default=16)
# argparser.add_argument("--load-8bit", action="store_true")
# argparser.add_argument("--load-4bit", action="store_true")
argparser.add_argument("--load-8bit", type=int, default=0)
argparser.add_argument("--load-4bit", type=int, default=1)
argparser.add_argument("--debug", action="store_true")
args = argparser.parse_args()
model_path = args.model_path
filt_invalid="cut"
model_name = get_model_name_from_path(args.model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit)
    our_chatbot = InferenceDemo(args, model_path, tokenizer, model, image_processor, context_len)
    ## Data folder structure of the image files:
    ## flat images in a folder.
    ## First generate local captions for each image, then global captions given randomly sampled images.
unzipped_tar_root = 'examples/realestate10k'
all_files = os.listdir(unzipped_tar_root)
all_images = [f for f in all_files if f.endswith(('.jpg', '.png')) and 'depth' not in f]
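    # Scene ids are the filename prefixes before the first '.'; depth maps were filtered out above.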
all_ids = sorted(list(set([file.split('.')[0] for file in all_images])))
DESCRIPTION_PROMPTS = [
'Please write a detailed image prompt for the entire scene.',
'Describe the colors, textures, shapes, and features of everything in the image, including the background and surroundings.',
'Please describe all the elements in the image, focusing on both objects and the overall scene.',
'Write a detailed image prompt for the scene.',
'Write an image prompt for the scene, less than 10 words.',
'Write an image prompt for the scene, less than 20 words.',
'Write an image prompt for the scene, less than 50 words.',
'Write an image prompt for the scene, less than 100 words.',
'Describe the setting, lighting, and mood of the scene along with the objects within it.'
]
    all_ids = all_ids[:1]  # only process the first scene for this test
    global_input_image_num = 8  # number of views fed jointly to the model for a scene-level caption
    num_global_prompts = 1      # number of global caption queries per scene
for image_id in all_ids:
st = time.time()
        # match files by exact scene id (substring matching could wrongly pick up other scenes)
        image_paths = sorted([os.path.join(unzipped_tar_root, f) for f in all_images if f.split('.')[0] == image_id])
### 1. local caption for each image
for image_path in image_paths:
            # randomly select an input prompt
            input_prompt = np.random.choice(DESCRIPTION_PROMPTS)
            print('Selected prompt:', input_prompt)
history = [
[(image_path,), None],
[input_prompt, None]
]
with torch.no_grad():
history, outputs = bot(history)
            # remove surrounding quotation marks from the caption
if outputs.startswith('"') and outputs.endswith('"'):
outputs = outputs[1:-1]
if outputs.startswith("'") and outputs.endswith("'"):
outputs = outputs[1:-1]
print(f'Local caption for {image_path}: {outputs}')
clear_history(history)
### 2. global caption given random images
for i in range(num_global_prompts):
            # sample distinct views; cap at the number of available images to avoid a ValueError
            global_image_paths = np.random.choice(image_paths, replace=False, size=min(global_input_image_num, len(image_paths)))
            # randomly select an input prompt
            input_prompt = np.random.choice(DESCRIPTION_PROMPTS)
            print('Selected prompt:', input_prompt)
history = [[(global_image_path,), None] for global_image_path in global_image_paths]
history.append([input_prompt, None])
with torch.no_grad():
history, outputs = bot(history)
            # remove surrounding quotation marks from the caption
if outputs.startswith('"') and outputs.endswith('"'):
outputs = outputs[1:-1]
if outputs.startswith("'") and outputs.endswith("'"):
outputs = outputs[1:-1]
print(f'Global caption for {image_id}: {outputs}')
clear_history(history)
        print(f'Time taken for {image_id}: {time.time() - st:.1f} seconds.')