vila_features.py
#!/usr/bin/env python3
import os
import argparse
import json
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 2 = show errors only, 3 = suppress all TF C++ log output
os.environ["TFHUB_CACHE_DIR"] = os.path.join(os.path.dirname(__file__), "pretrained_weights/tfmodels")
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_models as tfm
tf.experimental.numpy.experimental_enable_numpy_behavior()
physical_devices = tf.config.list_physical_devices('GPU')
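# enable memory growth so TensorFlow allocates GPU memory on demand
# instead of reserving all of it for this process up front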
for x in physical_devices:
    tf.config.experimental.set_memory_growth(x, True)
import numpy as np
import pandas as pd
from utils import video_frames
from utils import sample_non_uniform
from utils import prefix_dict
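# note: video_frames, sample_non_uniform, and prefix_dict come from the local utils
# module; they are assumed to decode a video into a list of HxWx3 RGB frames,
# sub-sample such a frame list, and prepend a prefix to all dict keys, respectively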
def vila_model():
    # load the VILA model from TF Hub and return a prediction closure
    model_handle = 'https://tfhub.dev/google/vila/image/1'
    model = hub.load(model_handle)
    predict_fn = model.signatures['serving_default']
    output_key = "predictions"

    def predict(png_str):
        # run VILA on a single PNG-encoded image
        res = predict_fn(png_str)
        return res[output_key]
    return predict
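# usage sketch for the closure on a single frame (assumes `frame` is an HxWx3
# uint8 RGB array and that the serving signature returns one score per image):
#
#   predict = vila_model()
#   png = tf.io.encode_png(tf.cast(frame, tf.uint8), compression=0)
#   score = float(tf.reshape(predict(png), [-1])[0])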
def extract_features(video_path, frame_sampling=True):
    # read video frames to gpu memory
    frames = video_frames(video_path)
    print(f"vila features: process {len(frames)} frames")
    res = {}
    models_fun = {
        "vila": vila_model()
    }
    for model in models_fun:
        # estimate model predictions; each frame is cast to uint8 and PNG-encoded
        # before being passed to the model
        res[model] = tf.map_fn(
            fn=lambda x: models_fun[model](
                tf.io.encode_png(
                    tf.cast(
                        x,
                        tf.uint8
                    ),
                    compression=0
                ),
            ),
            elems=np.array(frames),
            fn_output_signature=tf.float32
        ).numpy().flatten()
        # estimate model predictions on a central crop (cc) of each frame
        res[model + "_cc"] = tf.map_fn(
            fn=lambda x: models_fun[model](
                tf.io.encode_png(
                    tf.cast(
                        #tf.image.resize(
                        tf.image.central_crop(x, 0.5),  #(272, 272),
                        #preserve_aspect_ratio=True, antialias=False
                        #),
                        tf.uint8
                    ),
                    compression=0
                ),
            ),
            elems=np.array(frames),
            fn_output_signature=tf.float32
        ).numpy().flatten()
    # pool the per-frame features over time (mean)
    df = pd.DataFrame(res)
    mean = prefix_dict(df.mean().to_dict(), "mean_")
    return mean
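# with the single "vila" entry above, the returned dict holds two pooled values,
# "mean_vila" (full frames) and "mean_vila_cc" (central crops), each averaged
# over all processed frames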
if __name__ == "__main__":
    # argument parsing
    parser = argparse.ArgumentParser(description='vila feature estimation',
                                     epilog="stg7 2024",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("video", type=str, nargs="+", help="video to extract scores")
    parser.add_argument("--features_folder", type=str, default="features_vila", help="folder to store the calculated features")
    a = vars(parser.parse_args())

    for video in a["video"]:
        features = extract_features(video)
        features["video"] = video
        print(features)

        featuresfile = os.path.join(
            a["features_folder"], os.path.splitext(os.path.basename(video))[0] + ".json"
        )
        os.makedirs(a["features_folder"], exist_ok=True)
        print(f"saving features in {featuresfile}")
        with open(featuresfile, "w") as xfp:
            json.dump(features, xfp, indent=4, sort_keys=True)
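# usage sketch (the video filename is a placeholder):
#   python3 vila_features.py my_video.mp4 --features_folder features_vila
# writes the pooled scores to features_vila/my_video.json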