# weighting_strategy_part2.py
from __future__ import print_function
import argparse
import itertools
import os
import pdb
import pickle

import h5py
import numpy as np
import pandas as pd
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F
from torchvision import datasets, transforms
from torchvision.utils import save_image

import dirtorch.nets as nets
from dirtorch.utils import common
from finch import FINCH

# Offset added to Street2Shop (S2S) category ids so they do not collide with the
# DeepFashion (DF) category ids.
offset_s2s_category_ = 100
# FINCH clustering, run independently within each of the S2S-only shop categories
# (categories 0, 1, 3, 4, 5, shifted by offset_s2s_category_).
def W_within_catergory(data_, cid_):
    W_clustering_within_category = np.asarray([0, 1, 3, 4, 5]) + offset_s2s_category_
    W_ = {}
    for ii_ in W_clustering_within_category:
        temp_idx = np.where(cid_ == ii_)[0]
        feat_ = data_[temp_idx, :]
        c, num_clust, req_c = FINCH(feat_, initial_rank=None, req_clust=None, distance='cosine', verbose=False)
        W_['{}'.format(ii_)] = []
        W_['{}'.format(ii_)].append({'idx': temp_idx, 'c': c, 'nc': num_clust})
    return W_
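# Illustrative sketch (not part of the original pipeline; `shop_feat_` and
# `shop_cid_` are hypothetical placeholders for the shop feature matrix and the
# offset category ids): the per-category FINCH output can be inspected like this:
#
#     W_demo_ = W_within_catergory(shop_feat_, shop_cid_)
#     for cat_key_, entries_ in W_demo_.items():
#         first_partition_ = entries_[0]['c'][:, 0]   # cluster id per sample, first FINCH partition
#         print(cat_key_, len(entries_[0]['idx']), 'samples,', entries_[0]['nc'][0], 'clusters')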
# FINCH clustering of the remaining S2S shop categories, each clustered jointly
# with its corresponding DeepFashion shop categories (mapping given by
# FW_clustering_within_category_F_).
def FW_within_catergory(data_, cid_):
    W_class_ = np.asarray([2, 6, 7, 8, 9, 10]) + offset_s2s_category_
    FW_clustering_within_category_F_ = {0: [8, 9, 11], 1: [20], 2: [1, 15], 3: [3, 21], 4: [10, 12], 5: [13, 14, 16, 17, 18, 19]}
    FW_ = {}
    for ii_ in range(len(W_class_)):
        # S2S
        temp_W_category = W_class_[ii_]
        temp_W_idx = np.where(cid_ == temp_W_category)[0]
        temp_W_feat = data_[temp_W_idx, :]
        # DF
        temp_F_categories = FW_clustering_within_category_F_[ii_]
        temp_F_idx = []
        for jj_ in temp_F_categories:
            temp_idx_ = np.where(cid_ == jj_)[0]
            temp_F_idx = np.concatenate((temp_F_idx, temp_idx_)).astype(int)
        temp_F_feat = data_[temp_F_idx, :]
        temp_F_W_idx = np.concatenate((temp_F_idx, temp_W_idx)).astype(int)
        feat_ = np.concatenate((temp_F_feat, temp_W_feat))
        c, num_clust, req_c = FINCH(feat_, initial_rank=None, req_clust=None, distance='cosine', verbose=False)
        FW_['{}'.format(W_class_[ii_])] = []
        FW_['{}'.format(W_class_[ii_])].append({'idx': temp_F_W_idx, 'c': c, 'nc': num_clust})
    return FW_
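# Illustrative sketch (hypothetical names, continuing the example above): the
# joint DF+S2S clusters can be inspected the same way; `c[:, layer]` gives the
# cluster id of every sample listed in `idx` for the chosen FINCH partition:
#
#     FW_demo_ = FW_within_catergory(shop_feat_, shop_cid_)
#     entry_ = FW_demo_['102'][0]            # e.g. S2S category 2 + offset
#     layer_0_labels_ = entry_['c'][:, 0]    # aligned with entry_['idx']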
# Build the per-category weak-label index structure:
#   L1: DeepFashion consumer/shop pairs per product id,
#   L2: the same pairs extended with S2S shop samples that fall in the same
#       FINCH cluster (FW^{~}),
#   L3: pseudo product ids for the S2S-only categories (W^{~}).
def data_prep_per_category(all_c_pid_, all_c_cid_, all_s_pid_, all_s_cid_, FW_, W_, finch_layer_):
    map_classes_F_to_W_ = {0: None, 1: 107, 2: None, 3: 108, 4: None, 5: None, 6: None, 7: None, 8: 102, 9: 102, 10: 109, 11: 102, 12: 109, 13: 110, 14: 110, 15: 107, 16: 110, 17: 110, 18: 110, 19: 110, 20: 106, 21: 108, 22: None}
    add_new_classes_to_W_to_F_ = {100: 23, 101: 24, 103: 25, 104: 26, 105: 27}
    F_FW_W_ = {}
    ##############
    #
    # L3:W^{~}
    #
    ##############
    offset_pid_for_W_L3_ = len(np.unique(all_c_pid_)) + offset_s2s_category_
    k_ = offset_pid_for_W_L3_
    for W_cat_ in add_new_classes_to_W_to_F_:
        # L3:W^{~}
        F_FW_W_[add_new_classes_to_W_to_F_[W_cat_]] = []
        temp_W_category_ = W_cat_
        W_idx = W_['{}'.format(temp_W_category_)][0]['idx']
        W_clust_id = W_['{}'.format(temp_W_category_)][0]['c'][:, finch_layer_]
        for t_idx, clid_ in enumerate(W_clust_id):
            # Shop
            t_W_clid_ = np.where(W_clust_id == clid_)[0]
            t_W_clid_idx_ = W_idx[t_W_clid_]
            F_FW_W_[add_new_classes_to_W_to_F_[W_cat_]].append({'Ws': t_W_clid_idx_.tolist(), 'pid': k_})
            k_ = k_ + 1
    ##############
    #
    # L1:F and L2:FW^{~}
    #
    ##############
    for F_cat_ in map_classes_F_to_W_:
        # print(F_cat_)
        F_FW_W_[F_cat_] = []
        temp_W_category_ = map_classes_F_to_W_[F_cat_]
        temp_F_category_idx_ = np.where(all_c_cid_ == F_cat_)[0]
        temp_F_pids_ = all_c_pid_[temp_F_category_idx_]
        temp_F_pids_unique_ = np.unique(temp_F_pids_)
        if temp_W_category_ is not None:
            # L2 : FW^{~}
            FW_idx = FW_['{}'.format(temp_W_category_)][0]['idx']
            FW_clust_id = FW_['{}'.format(temp_W_category_)][0]['c'][:, finch_layer_]
            FW_pid = all_s_pid_[FW_idx]
            for pid_ in temp_F_pids_unique_:
                # Consumer
                t_F_pid_idx = np.where(all_c_pid_ == pid_)[0]
                # Shop
                t_Fs_pid_idx = np.where(all_s_pid_ == pid_)[0]
                # Shop: FW^{~}
                temp_FW_pid_idx = np.where(FW_pid == pid_)[0]
                temp_cluster_number_ = np.unique(FW_clust_id[temp_FW_pid_idx])
                # if len(temp_cluster_number_) > 1:
                #     pdb.set_trace()
                t_FW_pid_idx = []
                for cc_ in temp_cluster_number_:
                    temp_cluster_idx_ = np.where(FW_clust_id == cc_)[0]
                    temp_FW_all_pid_idx = FW_idx[temp_cluster_idx_]
                    # Only indices obtained via the clusters
                    temp_FW_diff_idx_ = np.setdiff1d(temp_FW_all_pid_idx, t_Fs_pid_idx)
                    temp_FW_diff_pid_ = all_s_pid_[temp_FW_diff_idx_]
                    # Exclude all other product ids from the Fashion dataset and keep only the S2S samples (pid == -1)
                    final_FW_diff_idx_ = temp_FW_diff_idx_[temp_FW_diff_pid_ == -1]
                    if len(final_FW_diff_idx_) > 0:
                        t_FW_pid_idx.append(final_FW_diff_idx_.tolist())
                t_FW_pid_idx = list(itertools.chain.from_iterable(t_FW_pid_idx))
                if len(t_FW_pid_idx):
                    F_FW_W_[F_cat_].append({'Fc': t_F_pid_idx.tolist(), 'Fs': t_Fs_pid_idx.tolist(), 'FWs': t_FW_pid_idx, 'pid': pid_})
                else:
                    F_FW_W_[F_cat_].append({'Fc': t_F_pid_idx.tolist(), 'Fs': t_Fs_pid_idx.tolist(), 'pid': pid_})
        else:
            for pid_ in temp_F_pids_unique_:
                # Consumer
                t_F_pid_idx = np.where(all_c_pid_ == pid_)[0]
                # Shop
                t_Fs_pid_idx = np.where(all_s_pid_ == pid_)[0]
                F_FW_W_[F_cat_].append({'Fc': t_F_pid_idx.tolist(), 'Fs': t_Fs_pid_idx.tolist(), 'pid': pid_})
    return F_FW_W_
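# For reference, the returned dictionary is keyed by DeepFashion category id
# (0..22) plus the remapped W-only class ids (23..27); each value is a list of
# dicts of the form {'Fc': [...], 'Fs': [...], 'FWs': [...], 'pid': ...} for the
# L1/L2 entries, or {'Ws': [...], 'pid': ...} for the L3 entries, where the lists
# hold sample indices into the consumer / shop feature arrays.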
def data_prep_all_category(all_c_pid_, all_s_pid_):
    Fc_Fs_ = []
    temp_F_pids_unique_ = np.unique(all_c_pid_)
    for pid_ in temp_F_pids_unique_:
        # Consumer
        t_Fc_pid_idx = np.where(all_c_pid_ == pid_)[0]
        # Shop
        t_Fs_pid_idx = np.where(all_s_pid_ == pid_)[0]
        Fc_Fs_.append({'Fc': t_Fc_pid_idx.tolist(), 'Fs': t_Fs_pid_idx.tolist(), 'pid': pid_})
    return Fc_Fs_
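# Illustrative usage (hypothetical variable names): once the pid arrays have been
# loaded, the category-agnostic consumer/shop index pairs can be built with
#
#     pairs_ = data_prep_all_category(all_c_pid_, all_s_pid_)
#     print(pairs_[0])   # {'Fc': [...], 'Fs': [...], 'pid': <product id>}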
# Load the DeepFashion (DF) and Street2Shop (S2S) shop features, run the
# within-category FINCH clusterings, and return everything the indexing
# functions above need.
def cluster_X_base(path_to_data_df_, path_to_data_s2s_, feat_epoch_num_, feat_type_):
    ########
    #
    # Shop Samples
    #
    ########
    data_s_df_ = h5py.File(os.path.join(path_to_data_df_, 'train_Shop_{}_{}.h5'.format(feat_type_, feat_epoch_num_)), 'r')
    data_s_df_feat_ = np.asarray(data_s_df_['X_'])  # -- BEFORE
    # # TO COMMENT -- AFTER
    # feat_path = h5py.File('/cvhci/data/fashion_NAVER/NAVER_Codes/deep-image-retrieval/kntorch/features/ClusteringFeats/CURR/DeepFashion/DeepFashion_ADAM_ALL_L12_0/train_Shop_X+1_0.h5', 'r')
    # data_s_df_feat_ = np.asarray(feat_path['X_'])  # -- AFTER
    data_s_df_pid_ = np.asarray(data_s_df_['pid']).astype(int)
    data_s_df_cid_ = np.asarray(data_s_df_['cid']).astype(int)
    data_s_df_dataset_id_ = np.ones(len(data_s_df_cid_)).astype(int)
    data_s_s2s_ = h5py.File(os.path.join(path_to_data_s2s_, 'train_Shop_{}_{}.h5'.format(feat_type_, feat_epoch_num_)), 'r')
    data_s_s2s_feat_ = np.asarray(data_s_s2s_['X_'])  # -- BEFORE
    # # TO COMMENT -- AFTER
    # feat_path = h5py.File('/cvhci/data/fashion_NAVER/NAVER_Codes/deep-image-retrieval/kntorch/features/ClusteringFeats/CURR/Street2Shop/DeepFashion_ADAM_ALL_L12_0/train_Shop_X+1_0.h5', 'r')
    # data_s_s2s_feat_ = np.asarray(feat_path['X_'])  # -- AFTER
    # data_s_s2s_pid_ = np.asarray(data_s_s2s_['pid']).astype(int)
    data_s_s2s_cid_ = np.asarray(data_s_s2s_['cid']).astype(int) + offset_s2s_category_
    data_s_s2s_dataset_id_ = 2 * np.ones(len(data_s_s2s_cid_)).astype(int)
    data_s_s2s_pid_ = -1 * np.ones(len(data_s_s2s_cid_)).astype(int)
    all_s_feat_ = np.concatenate((data_s_df_feat_, data_s_s2s_feat_))
    all_s_pid_ = np.concatenate((data_s_df_pid_, data_s_s2s_pid_))
    all_s_cid_ = np.concatenate((data_s_df_cid_, data_s_s2s_cid_))
    all_s_dataset_id_ = np.concatenate((data_s_df_dataset_id_, data_s_s2s_dataset_id_))
    # all_indexes = np.asarray(range(len(all_s_dataset_id_)))
    del data_s_df_, data_s_df_feat_, data_s_df_pid_, data_s_df_cid_, data_s_df_dataset_id_
    del data_s_s2s_, data_s_s2s_feat_, data_s_s2s_pid_, data_s_s2s_cid_, data_s_s2s_dataset_id_
    # S2S within category
    W_ = W_within_catergory(all_s_feat_, all_s_cid_)
    # FW (DF-S2S) within category
    FW_ = FW_within_catergory(all_s_feat_, all_s_cid_)
    ########
    #
    # Consumer Samples
    #
    ########
    all_c_df_ = h5py.File(os.path.join(path_to_data_df_, 'train_Consumer_X_{}.h5'.format(feat_epoch_num_)), 'r')
    # data_c_df_feat_ = np.asarray(all_c_df_['X_'])
    all_c_pid_ = np.asarray(all_c_df_['pid']).astype(int)
    all_c_cid_ = np.asarray(all_c_df_['cid']).astype(int)
    all_c_idx_ = np.asarray(range(len(all_c_cid_)))
    return W_, FW_, all_s_dataset_id_, all_c_pid_, all_c_cid_, all_s_pid_, all_s_cid_
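# Note: `W_` / `FW_` hold the FINCH clusterings of the shop features; the other
# return values are the pid / cid arrays of the consumer images and of the
# concatenated DF + S2S shop images (with `all_s_dataset_id_` marking 1 = DF,
# 2 = S2S). S2S shop samples carry pid = -1 and category ids shifted by
# `offset_s2s_category_`.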
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Expert Matcher')
    parser.add_argument('--optimizer', type=str, default='ADAM',
                        help='ADAM | SGD')
    parser.add_argument('--comb', type=str, default='L123',
                        help='L1 | L12 | L123')
    parser.add_argument('--finch-part', type=int, default=0)
    args = parser.parse_args()

    #############################
    #
    # Setting default parameters
    #
    #############################
    # Device configuration
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    combinations_type = args.comb  # L1, L12, L123
    finch_partition_number_ = args.finch_part  # 0 -> first partition, 1 -> second partition

    #############################
    #
    # Generate weak labels for the first time
    #
    #############################
    path_to_BASE_data_ = 'features/ClusteringFeats/BASE'
    model_name = 'DeepFashion_ADAM_ALL'
    feat_epoch_num_ = 60
    feat_type_ = 'X'
    path_to_data_df_ = os.path.join(path_to_BASE_data_, 'DeepFashion', model_name)
    path_to_data_s2s_ = os.path.join(path_to_BASE_data_, 'Street2Shop', model_name)
    output_finch_pred_path_ = 'features/finch_partitions'
    output_finch_pred_filename_npz = os.path.join(output_finch_pred_path_, model_name, 'L1_L2_L3_finch_base_X_{}_Finch_{}.npz'.format(feat_epoch_num_, finch_partition_number_))
    output_finch_pred_filename_pkl = os.path.join(output_finch_pred_path_, model_name, 'L1_L2_L3_finch_base_X_{}_Finch_{}.pkl'.format(feat_epoch_num_, finch_partition_number_))
    if not os.path.isfile(output_finch_pred_filename_pkl):
        # Clustering
        W_, FW_, all_s_dataset_id_, all_c_pid_, all_c_cid_, all_s_pid_, all_s_cid_ = cluster_X_base(path_to_data_df_, path_to_data_s2s_, feat_epoch_num_, feat_type_)
        # Indexing
        F_FW_W_ = data_prep_per_category(all_c_pid_, all_c_cid_, all_s_pid_, all_s_cid_, FW_, W_, finch_partition_number_)
        Fc_Fs = data_prep_all_category(all_c_pid_, all_s_pid_)
        np.savez(output_finch_pred_filename_npz, W_=W_, FW_=FW_, all_s_dataset_id_=all_s_dataset_id_, all_c_pid_=all_c_pid_, all_c_cid_=all_c_cid_, all_s_pid_=all_s_pid_, all_s_cid_=all_s_cid_, Fc_Fs=Fc_Fs)
        with open(output_finch_pred_filename_pkl, 'wb') as f:
            pickle.dump(F_FW_W_, f, pickle.HIGHEST_PROTOCOL)
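# Example invocation (illustrative; the feature directories above are assumed to
# exist relative to the working directory):
#
#     python weighting_strategy_part2.py --optimizer ADAM --comb L123 --finch-part 0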