bestow136 committed on
Commit
8ffcfd0
·
verified ·
1 Parent(s): ed67ff4

Upload 13 files

Browse files
Files changed (14) hide show
  1. .gitattributes +2 -0
  2. 3Sources.mat +3 -0
  3. Cluster.py +284 -0
  4. Datasets.py +39 -0
  5. Prokaryotic.mat +3 -0
  6. SRW_KNN_greedy.py +219 -0
  7. anchors.py +22 -0
  8. config.py +85 -0
  9. data_loader.py +37 -0
  10. model.py +181 -0
  11. run.py +310 -0
  12. sample_kernal.py +44 -0
  13. train_methods.py +73 -0
  14. utils.py +80 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ 3Sources.mat filter=lfs diff=lfs merge=lfs -text
37
+ Prokaryotic.mat filter=lfs diff=lfs merge=lfs -text
3Sources.mat ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b56f4c3441fbaa0dc0281897851722c122bb00b24abdef15ff7e4c88dace833
3
+ size 112113
Cluster.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import torch.nn
4
+ from utils import cosineSimilartydis
5
+ from sklearn import metrics
6
+ import sklearn.metrics as metrics
7
+ from sklearn.cluster import KMeans
8
+ from munkres import Munkres
9
+ import sys
10
+ import logging
11
+ from sample_kernal import *
12
def _greedy_match(cost, n_rows):
    """Greedily assign rows ``0..n_rows-1`` of ``cost`` to distinct columns.

    Rows are handled in order. Each row takes the column that sorts first in
    its (current) cost row; that column is then overwritten with ``inf`` in
    ``cost`` (in place) so later rows cannot take it again.

    Returns:
        list[int]: the column chosen for every processed row.
    """
    chosen = []
    for row in range(n_rows):
        # argsort()[0] (not argmin) is kept on purpose so ties are broken the
        # same way as before.
        col = int(torch.argsort(cost[row, :])[0])
        chosen.append(col)
        cost[:, col] = float("inf")
    return chosen


def tiny_infer(model, device, all_data, all_label_X, all_label_Y):
    """Embed both views, align them greedily and return the aligned embeddings.

    Steps, as implemented below:
      1. Both views are run through ``model``; the view with fewer samples
         (``all_data[k].shape[1]``) is treated as the "short" view.
      2. Every short-view embedding is greedily matched to a distinct
         long-view embedding using ``cosineSimilartydis``.
      3. The short-view embeddings are ordered by their match cost and
         ``long_num - test_num`` extra rows are synthesised with
         ``kernel_regression_multi_dim`` / ``insert_and_sort`` at the
         positions with the largest cost gaps.
      4. Every long-view embedding is greedily matched against the augmented
         short view.

    Args:
        model: callable returning ``(h0, h1)`` for the two flattened views.
        device: torch device used for the forward pass and all intermediate
            tensors.
        all_data: pair of tensors; sample count is read from ``shape[1]``.
        all_label_X, all_label_Y: numpy label arrays of view 0 / view 1.

    Returns:
        ``(alignre0, alignre1, class_labels_cluster, inference_acc)`` where
        ``alignre0`` holds the long-view embeddings, ``alignre1`` the matched
        rows of the augmented short view, ``class_labels_cluster`` the labels
        of the short view and ``inference_acc`` the fraction
        ``matches / test_num``.

    NOTE(review): the returned labels belong to the short view while the
    returned embeddings have ``long_num`` rows; ``long_labels`` looks like the
    intended companion when the views differ in size - confirm with callers.
    """
    model.eval()
    len_alldata0 = all_data[0].shape[1]
    len_alldata1 = all_data[1].shape[1]
    view0_is_longer = len_alldata0 > len_alldata1
    if view0_is_longer:
        short_labels, long_labels = all_label_Y, all_label_X
        test_num, long_num = len_alldata1, len_alldata0
    else:
        short_labels, long_labels = all_label_X, all_label_Y
        test_num, long_num = len_alldata0, len_alldata1
    align_labels = torch.zeros(max(len_alldata0, len_alldata1))
    labels = torch.from_numpy(short_labels)

    with torch.no_grad():
        x0, x1, labels = all_data[0].to(device), all_data[1].to(device), labels.to(device)
        x0 = x0.view(x0.size()[0], -1).T
        x1 = x1.view(x1.size()[0], -1).T
        h0, h1 = model(x0, x1)

        # The distance is always computed as (h0, h1) and transposed when
        # view 1 is the short one: cosineSimilartydis normalises per column,
        # so swapping its arguments would not give the same matrix.
        C = cosineSimilartydis(h0, h1)
        if view0_is_longer:
            C = C.T
            short_h, long_h = h1, h0
        else:
            short_h, long_h = h0, h1

        # ---- pass 1: short view -> long view --------------------------------
        C_temp = C.clone()  # untouched costs; C itself is masked while matching
        first_match = _greedy_match(C, test_num)
        rows = torch.arange(test_num, device=C_temp.device)
        cols = torch.tensor(first_match, dtype=torch.long, device=C_temp.device)
        sort_value = C_temp[rows, cols].cpu()
        align_out0 = short_h[:test_num].cpu()
        align_out1 = long_h[cols].cpu().numpy()

        # Order the short view by match cost and locate the largest gaps.
        sorted_list, sorted_indice0 = torch.sort(sort_value)
        sorted_align0 = align_out0[sorted_indice0.to(torch.long)].numpy()
        differences = sorted_list[1:] - sorted_list[:-1]
        x_known = np.arange(len(sorted_list))
        Xn = long_num - test_num
        _, top_indices = torch.topk(differences, Xn)
        # Midpoint between positions i and i + 1 for each selected gap.
        average_indices = np.array([i.item() + 0.5 for i in top_indices])
        bandwidth = 1.0

        A3_initial = kernel_regression_multi_dim(x_known, sorted_align0, average_indices, bandwidth)
        _, y_sorted_align0 = insert_and_sort(x_known, sorted_align0, average_indices, A3_initial)
        # Fix: follow the ``device`` argument instead of a hard-coded 'cuda'.
        y_sorted_align0 = torch.tensor(y_sorted_align0).to(device).float()

        # ---- pass 2: long view -> augmented short view ----------------------
        Cre = cosineSimilartydis(long_h, y_sorted_align0)
        second_match = _greedy_match(Cre, long_num)
        alignre0 = long_h[:long_num].cpu().numpy()
        # Fix: use the column matched in *this* pass. The previous code reused
        # the stale ``idx`` left over from pass 1, so every row of alignre1
        # was the same vector.
        alignre1 = y_sorted_align0[second_match].cpu().numpy()

        n_short = len(short_labels)
        for i, j in enumerate(second_match):
            # NOTE(review): ``j`` is a position in the sorted/augmented short
            # view, not necessarily an index into the original label array, so
            # this accuracy is only indicative - confirm the intended mapping.
            # The bound check avoids indexing past the short label array.
            if j < n_short and long_labels[i] == short_labels[j]:
                align_labels[i] = 1

    class_labels_cluster = labels.cpu().numpy()
    count = torch.sum(align_labels)
    inference_acc = count.item() / test_num
    print(inference_acc)
    print(np.shape(align_out1))
    return np.array(alignre0), np.array(alignre1), np.array(class_labels_cluster), inference_acc
117
def Clustering(x_list, y):
    """Run k-means on the column-wise concatenation of ``x_list`` and score it.

    The number of clusters is the number of distinct values in ``y``.

    Returns:
        ``(y_preds, ret, accuracy, nmi, ari, f_score, f_score2, precision,
        precision2, recall, purity)`` where ``y_preds`` are the k-means
        assignments remapped by ``get_y_preds`` and ``ret['kmeans']`` is the
        score dictionary produced by ``clustering_metric``.
    """
    n_clusters = np.unique(y).size
    features = np.concatenate(x_list, axis=1)
    kmeans_assignments, km = get_cluster_sols(
        features,
        ClusterClass=KMeans,
        n_clusters=n_clusters,
        init_args={'n_init': 10},
    )
    y_preds = get_y_preds(y, kmeans_assignments, n_clusters)

    # Labels that start at 1 are shifted to start at 0 before scoring.
    if np.min(y) == 1:
        y = y - 1

    (scores, _, accuracy, nmi, ari, f_score, f_score2,
     precision, precision2, recall, purity) = clustering_metric(y, kmeans_assignments, n_clusters)

    ret = {'kmeans': scores}
    return y_preds, ret, accuracy, nmi, ari, f_score, f_score2, precision, precision2, recall, purity
134
+
135
def get_y_preds(y_true, cluster_assignments, n_clusters):
    '''
    Remap cluster ids onto label ids using a 1:1 assignment computed with
    the Munkres (Hungarian) algorithm on the confusion matrix.

    y_true:               true labels
    cluster_assignments:  array of cluster ids, e.g. as produced by k-means
    n_clusters:           number of clusters in the dataset

    returns: an array with, for every sample, the label id its cluster was
             assigned to
    '''
    confusion = metrics.confusion_matrix(y_true, cluster_assignments, labels=None)
    cost = calculate_cost_matrix(confusion, n_clusters)
    assignment = Munkres().compute(cost)
    cluster_to_label = get_cluster_labels_from_indices(assignment)

    # Shift the cluster ids so that the smallest one is 0 before using them
    # as indices into the mapping.
    offset = np.min(cluster_assignments)
    if offset != 0:
        cluster_assignments = cluster_assignments - offset
    return cluster_to_label[cluster_assignments]
157
+
158
def get_cluster_sols(x, cluster_obj=None, ClusterClass=None, n_clusters=None, init_args=None):
    '''
    Using either a newly instantiated ClusterClass or a provided
    cluster_obj, generates cluster assignments based on input data

    x:              the points with which to perform clustering
    cluster_obj:    a pre-fitted instance of a clustering class
    ClusterClass:   a reference to the clustering class, necessary
                    if instantiating a new clustering class
    n_clusters:     number of clusters in the dataset, necessary
                    if instantiating new clustering class
    init_args:      optional dict of keyword arguments passed to ClusterClass
                    (None means "no extra arguments")

    returns:    a tuple containing the label assignments and the clustering
                object. If fitting a new object fails ten times in a row, the
                assignments are an all-zero array of length len(x).
    '''
    # if cluster_obj is None, we must have both ClusterClass and n_clusters
    assert not (cluster_obj is None and (ClusterClass is None or n_clusters is None))

    if cluster_obj is None:
        # ``init_args`` defaults to None rather than a shared mutable dict.
        cluster_obj = ClusterClass(n_clusters, **(init_args or {}))
        for _ in range(10):
            try:
                cluster_obj.fit(x)
                break
            except Exception:
                # Only ordinary errors are retried; KeyboardInterrupt and
                # SystemExit now propagate instead of being swallowed.
                print("Unexpected error:", sys.exc_info())
        else:
            # All ten attempts failed: fall back to a single dummy cluster.
            return np.zeros((len(x),)), cluster_obj

    cluster_assignments = cluster_obj.predict(x)
    return cluster_assignments, cluster_obj
189
+
190
def calculate_cost_matrix(C, n_clusters):
    """Build the assignment cost matrix from a confusion matrix ``C``.

    ``cost[j, i]`` is the total of column ``j`` of ``C`` minus ``C[i, j]``,
    i.e. the number of samples in column ``j`` that do NOT fall in row ``i``.

    Returns:
        float array of shape ``(n_clusters, n_clusters)``.
    """
    cost_matrix = np.zeros((n_clusters, n_clusters))
    row_ids = np.arange(n_clusters)
    for j in range(n_clusters):
        column_total = np.sum(C[:, j])
        # Fill the whole row j at once: column total minus each entry C[i, j].
        cost_matrix[j] = column_total - C[row_ids, j]
    return cost_matrix
200
+
201
+
202
def get_cluster_labels_from_indices(indices):
    """Return the second element of every ``(row, column)`` pair as a float array.

    ``indices`` is a sequence of pairs (such as the result of
    ``Munkres().compute``); entry ``k`` of the result is ``indices[k][1]``.
    """
    return np.array([pair[1] for pair in indices], dtype=float)
208
+
209
def clustering_metric(y_true, y_pred, n_clusters, verbose=False, decimals=4):
    """Score a clustering against ground-truth labels.

    The cluster ids in ``y_pred`` are first remapped onto label ids with
    ``get_y_preds``; every metric is then computed on the remapped prediction
    and rounded to ``decimals`` places.

    Returns:
        ``(scores, confusion_matrix, accuracy, nmi, ari, f_score, f_score2,
        precision, precision2, recall, purity)`` where ``scores`` is a dict
        with the keys ACC, AMI, NMI, ARI, F1, F2, PRE, PRE2, REC and PUR.
        ``verbose`` is accepted but not used by this function.
    """
    y_aligned = get_y_preds(y_true, y_pred, n_clusters)
    # Only the confusion matrix of this call is used below.
    _classification_metrics, confusion_matrix = classification_metric(y_true, y_aligned)

    def _rounded(value):
        return np.round(value, decimals)

    accuracy = _rounded(metrics.accuracy_score(y_true, y_aligned))
    ami = _rounded(metrics.adjusted_mutual_info_score(y_true, y_aligned))
    nmi = _rounded(metrics.normalized_mutual_info_score(y_true, y_aligned))
    ari = _rounded(metrics.adjusted_rand_score(y_true, y_aligned))
    # F-score, macro and weighted averages
    f_score = _rounded(metrics.f1_score(y_true, y_aligned, average='macro'))
    f_score2 = _rounded(metrics.f1_score(y_true, y_aligned, average='weighted'))
    # precision, macro and weighted averages
    precision = _rounded(metrics.precision_score(y_true, y_aligned, average='macro'))
    precision2 = _rounded(metrics.precision_score(y_true, y_aligned, average='weighted'))
    recall = _rounded(metrics.recall_score(y_true, y_aligned, average='macro'))
    # Purity is evaluated last, after all other metrics have read y_true.
    purity = _rounded(Purity(y_true, y_aligned))

    scores = {
        'ACC': accuracy, 'AMI': ami, 'NMI': nmi, 'ARI': ari,
        'F1': f_score, 'F2': f_score2,
        'PRE': precision, 'PRE2': precision2,
        'REC': recall, 'PUR': purity,
    }
    return (scores, confusion_matrix, accuracy, nmi, ari, f_score, f_score2,
            precision, precision2, recall, purity)
245
def Purity(y_true, y_pred):
    """Return the purity of a clustering.

    Every cluster in ``y_pred`` votes for the ground-truth class that is most
    frequent among its members; purity is the fraction of samples whose class
    equals the vote of their cluster.

    Unlike the previous implementation this function
      * does not modify the caller's ``y_true`` array, and
      * maps classes to ``0..K-1`` in one step via ``np.unique``, so classes
        are never merged by accident (the old sequential in-place relabelling
        collapsed e.g. the labels ``-1`` and ``0`` into one class).

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of cluster ids, same length as ``y_true``.

    Returns:
        float in ``[0, 1]``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # class_idx[k] is the rank of y_true[k] among the sorted distinct labels.
    classes, class_idx = np.unique(y_true, return_inverse=True)
    class_idx = class_idx.reshape(y_true.shape)

    voted = np.zeros(y_true.shape, dtype=class_idx.dtype)
    for cluster in np.unique(y_pred):
        members = y_pred == cluster
        counts = np.bincount(class_idx[members], minlength=classes.shape[0])
        voted[members] = np.argmax(counts)  # ties go to the lowest class index

    # Same value as accuracy_score(class_idx, voted) for 1-D label arrays.
    return float(np.mean(voted == class_idx))
260
+
261
def classification_metric(y_true, y_pred, average='macro', verbose=False, decimals=4):
    """Compute accuracy, precision, recall and F-score plus the confusion matrix.

    Precision, recall and F-score use the ``average`` mode given; all four
    scores are rounded to ``decimals`` places. When ``verbose`` is true the
    scores are also written to the log at INFO level.

    Returns:
        ``(scores, confusion_matrix)`` where ``scores`` has the keys
        ``accuracy``, ``precision``, ``recall`` and ``f_measure``.
    """
    confusion_matrix = metrics.confusion_matrix(y_true, y_pred)

    def _rounded(value):
        return np.round(value, decimals)

    accuracy = _rounded(metrics.accuracy_score(y_true, y_pred))
    precision = _rounded(metrics.precision_score(y_true, y_pred, average=average))
    recall = _rounded(metrics.recall_score(y_true, y_pred, average=average))
    f_score = _rounded(metrics.f1_score(y_true, y_pred, average=average))

    if verbose:
        logging.info('accuracy: {}, precision: {}, recall: {}, f_measure: {}'.format(accuracy, precision, recall, f_score))

    scores = {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f_measure': f_score}
    return scores, confusion_matrix
Datasets.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import Dataset, DataLoader
2
+ import numpy as np
3
+ import torch
4
class GetDataset(Dataset):
    """Pair dataset over two views stored column-wise.

    ``data[0]`` and ``data[1]`` are indexed as ``[:, index]``, i.e. one sample
    per column. Each item is ``(fea0, fea1, label)`` or, when ``real_labels``
    is non-empty, ``(fea0, fea1, label, real_label)``. Features are float32
    tensors with a leading dimension of size 1; labels are ``np.int64``.
    """

    def __init__(self, data, labels, real_labels):
        self.data = data
        self.labels = labels
        self.real_labels = real_labels

    def __len__(self):
        return len(self.labels)

    def _feature(self, view, index):
        # Column ``index`` of the requested view as a (1, n_features) tensor.
        column = self.data[view][:, index]
        return torch.from_numpy(column).float().unsqueeze(0)

    def __getitem__(self, index):
        fea0 = self._feature(0, index)
        fea1 = self._feature(1, index)
        item = (fea0, fea1, np.int64(self.labels[index]))
        if len(self.real_labels) == 0:
            return item
        return item + (np.int64(self.real_labels[index]),)
21
+
22
+
23
class GetAllDataset(Dataset):
    """Pair dataset that also carries a class label for each view.

    ``data[0]`` and ``data[1]`` are indexed as ``[:, index]``, i.e. one sample
    per column. Each item is ``(fea0, fea1, label, class_labels0,
    class_labels1)``; features are float32 tensors with a leading dimension of
    size 1 and all labels are ``np.int64``.
    """

    def __init__(self, data, labels, class_labels0, class_labels1):
        self.data = data
        self.labels = labels
        self.class_labels0 = class_labels0
        self.class_labels1 = class_labels1

    def __len__(self):
        return len(self.labels)

    def _feature(self, view, index):
        # Column ``index`` of the requested view as a (1, n_features) tensor.
        column = self.data[view][:, index]
        return torch.from_numpy(column).float().unsqueeze(0)

    def __getitem__(self, index):
        fea0 = self._feature(0, index)
        fea1 = self._feature(1, index)
        return (
            fea0,
            fea1,
            np.int64(self.labels[index]),
            np.int64(self.class_labels0[index]),
            np.int64(self.class_labels1[index]),
        )
Prokaryotic.mat ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20471598a8c819fb35e94f102cda4300a3b4f2cd185d3cbe0ea06e0349a7ed7c
3
+ size 3105301
SRW_KNN_greedy.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import umap
4
+ import matplotlib.pyplot as plt
5
+ from run import *
6
+ # 固定随机数种子,生成100个二维数据点
7
+ # torch.manual_seed(99)
8
def para(data, num_nodes, num_class):
    """Derive the random-walk and anchor-selection parameters for one view.

    Returns:
        ``(num_walks, walk_length, similarity_threshold, num_anchors,
        coverage_radius)`` where
          * the similarity threshold is fixed at 0.4,
          * the anchor count is twice ``num_class``,
          * the coverage radius is 0.3 times the mean off-diagonal value of
            ``cosineSimilartydis(data, data)``,
          * walks get fewer and longer as ``num_nodes`` grows.
    """
    pairwise = cosineSimilartydis(data, data)
    # Mean distance excluding every point's distance to itself.
    off_diagonal = ~torch.eye(pairwise.size(0), dtype=torch.bool)
    coverage_radius = pairwise[off_diagonal].mean() * 0.3
    # TODO: report an error when there are fewer aligned samples than anchors.

    # (exclusive upper bound on num_nodes, (num_walks, walk_length))
    schedule = (
        (100, (20, 3)),      # small graph
        (1000, (10, 5)),     # medium graph
        (10000, (5, 10)),    # large graph
    )
    for upper_bound, walk_setting in schedule:
        if num_nodes < upper_bound:
            num_walks, walk_length = walk_setting
            break
    else:
        num_walks, walk_length = 3, 20  # very large graph

    similarity_threshold = 0.4
    num_anchors = num_class * 2
    return num_walks, walk_length, similarity_threshold, num_anchors, coverage_radius
26
+
27
+
28
+
29
+
30
def cosineSimilarty(A, B):
    """Column-wise min-max scaled cosine similarity between rows of A and B.

    Rows are scaled to (almost) unit L2 norm, the pairwise dot products are
    taken, and every column of the result is rescaled to ``[0, 1]`` using
    that column's own minimum and maximum. Entries that come out as NaN
    (a constant column) are replaced by 0.0001.
    """
    eps = 0.000001
    a_unit = A / (A.norm(p=2, dim=1, keepdim=True) + eps)
    b_unit = B / (B.norm(p=2, dim=1, keepdim=True) + eps)

    W = torch.mm(a_unit, b_unit.t())
    col_max = W.max(dim=0).values
    col_min = W.min(dim=0).values
    scaled = (W - col_min) / (col_max - col_min)
    return torch.nan_to_num(scaled, nan=0.0001)
43
+
44
def cosineSimilartydis(A, B):
    """Cosine distance: one minus the column-wise min-max scaled similarity.

    Rows are scaled to (almost) unit L2 norm, the pairwise dot products are
    taken, every column is rescaled to ``[0, 1]`` with its own minimum and
    maximum (NaN entries become 0.0001), and the result is subtracted from 1.
    """
    eps = 0.000001
    a_unit = A / (A.norm(p=2, dim=1, keepdim=True) + eps)
    b_unit = B / (B.norm(p=2, dim=1, keepdim=True) + eps)

    W = torch.mm(a_unit, b_unit.t())
    col_max = W.max(dim=0).values
    col_min = W.min(dim=0).values
    scaled = torch.nan_to_num((W - col_min) / (col_max - col_min), nan=0.0001)
    return 1 - scaled
54
+ # # 随机游走参数
55
+ #
56
+ #
57
+ # Step 1: 初始化完全图的转移概率矩阵
58
+ # distances = torch.cdist(data, data, p=2) # 计算所有点之间的欧几里得距离
59
+ # adj_matrix = torch.exp(-distances) # 高斯权重:距离越小权重越高
60
+ # def visit(data):
61
+ # adj_matrix=cosineSimilarty(data,data)
62
+ # print(np.shape(adj_matrix))
63
+ # adj_matrix.fill_diagonal_(0) # 去掉自身连接
64
+ # transition_matrix = adj_matrix / adj_matrix.sum(dim=1, keepdim=True) # 归一化为转移概率
65
+ # return transition_matrix
66
+
67
+
68
def visit(data, alpha=0.5):
    """
    Build a row-normalised transition matrix from the node features ``data``.

    The scaled cosine similarity is row-normalised, clamped to at least
    0.0001, raised to the power ``-alpha`` and row-normalised again, so
    (for positive ``alpha``) less similar nodes get a higher transition weight.

    NOTE(review): the diagonal is zeroed and then clamped up to 0.0001, the
    smallest value in the matrix; after ``** (-alpha)`` it therefore carries
    the largest weight in its row, i.e. self-transitions are favoured.
    Confirm that this is intended.
    """
    num_nodes = data.size(0)

    # Pairwise similarity without self-connections.
    weights = cosineSimilarty(data, data)
    weights.fill_diagonal_(0)
    weights = torch.nan_to_num(weights, nan=0.0001)

    # First normalisation: each row sums to (almost) one.
    weights = weights / (weights.sum(dim=1, keepdim=True) + 0.000001)

    # Replace NaNs and keep every entry strictly positive so the negative
    # power below is well defined.
    weights = torch.clamp(torch.nan_to_num(weights, nan=0.0001), min=0.0001)

    # Invert the preference with the exponent -alpha, then renormalise rows.
    powered = weights ** (-alpha)
    transition_matrix = powered / (powered.sum(dim=1, keepdim=True) + 0.000001)

    # Any row that still sums to zero falls back to a uniform distribution.
    empty_rows = transition_matrix.sum(dim=1) == 0
    if empty_rows.any():
        transition_matrix[empty_rows] = 1.0 / num_nodes

    return transition_matrix
103
+
104
+
105
+ # 优化方法
106
def random_walk_batch_paths(transition_matrix, num_walks, walk_length):
    """
    Simulate random walks from every node and count the visits.

    For each start node, ``num_walks`` walks of ``walk_length`` steps are
    sampled in one batch with ``torch.multinomial``. Row ``s`` of the result
    counts how often each node appears on the walks started at ``s``; the
    ``num_walks`` occurrences of the start node at step 0 are subtracted from
    the diagonal at the end.

    All tensors are created on ``transition_matrix.device`` (previously the
    device was hard-coded to 'cuda', which failed on CPU-only machines).

    Args:
        transition_matrix: square tensor whose row ``i`` holds the sampling
            weights for leaving node ``i``.
        num_walks: number of walks per start node.
        walk_length: number of steps per walk.

    Returns:
        Tensor with the shape and dtype of ``transition_matrix``.
    """
    device = transition_matrix.device
    num_nodes = transition_matrix.size(0)
    visit_matrix = torch.zeros_like(transition_matrix)

    for start_node in range(num_nodes):
        # One row per walk; column 0 is the start node.
        paths = torch.full((num_walks, walk_length + 1), start_node, dtype=torch.long, device=device)
        for step in range(walk_length):
            probs = transition_matrix[paths[:, step]]
            # squeeze(1) keeps the batch dimension even when num_walks == 1.
            paths[:, step + 1] = torch.multinomial(probs, 1).squeeze(1)

        # Count every visited node of every walk in a single scatter-add.
        visited = paths.reshape(-1)
        ones = torch.ones(visited.numel(), dtype=visit_matrix.dtype, device=device)
        visit_matrix[start_node].index_add_(0, visited, ones)

    # Remove the initial "visit" each walk pays to its own start node.
    visit_matrix -= torch.diag(torch.full((num_nodes,), num_walks, dtype=visit_matrix.dtype, device=device))
    return visit_matrix
126
+
127
+
128
+ # visit_matrix = random_walk_batch_paths(transition_matrix, num_walks, walk_length)
129
+ #
130
+ # # visit_matrix = random_walk_parallel(transition_matrix, num_walks, walk_length)
131
+ #
132
+ # Step 3: 归一化访问频率为相似度,构建基于阈值的 kNN 图
133
def thresholded_knn(visit_matrix, similarity_threshold):
    """Turn visit counts into a 0/1 adjacency matrix.

    The counts are divided by their global maximum; entries strictly greater
    than ``similarity_threshold`` become 1.0, all others 0.0.
    """
    peak = visit_matrix.max()
    keep = (visit_matrix / peak) > similarity_threshold
    return keep.to(torch.float32)
137
+ # # Step 5: 贪心覆盖算法选择锚点
138
def greedy_cover_with_importance(data, importance_scores, r, num_anchors):
    """
    Select anchor points with a greedy covering strategy.

    Points are visited in order of decreasing importance. A point becomes an
    anchor if it is neither covered by an earlier anchor nor an anchor
    already; it then covers every point whose ``cosineSimilartydis`` distance
    to it is at most ``r``. When every point is covered before enough anchors
    were found, the coverage is reset (anchors stay covered) and another
    round starts.

    :param data: data points, shape (n_samples, n_features)
    :param importance_scores: one importance score per point
    :param r: coverage radius
    :param num_anchors: number of anchors to select
    :return: 1-D long tensor with the anchor indices, on ``data.device``
    :raises ValueError: if ``num_anchors`` exceeds the number of points
        (the selection loop could never terminate in that case)
    """
    num_points = data.size(0)
    if num_anchors > num_points:
        raise ValueError(
            f"num_anchors ({num_anchors}) exceeds the number of data points ({num_points})"
        )

    # Follow the input's device instead of a hard-coded 'cuda'.
    device = data.device
    distances = cosineSimilartydis(data, data)
    selected = []
    covered = torch.zeros(num_points, dtype=torch.bool, device=device)
    is_anchor = torch.zeros(num_points, dtype=torch.bool, device=device)
    order = torch.argsort(importance_scores, descending=True).tolist()

    while len(selected) < num_anchors:
        for idx in order:
            if len(selected) >= num_anchors:
                break
            if not covered[idx] and not is_anchor[idx]:
                selected.append(idx)
                is_anchor[idx] = True
                covered |= distances[idx] <= r
                # The anchor itself must count as covered even if its own
                # distance entry is above r.
                covered[idx] = True

        if bool(covered.all()):
            print("所有点已被覆盖,重置覆盖状态")
            # Start a new round: only the anchors chosen so far stay covered.
            covered[:] = False
            covered[is_anchor] = True
            print(len(selected))

    return torch.tensor(selected, dtype=torch.long, device=device)
179
+
180
+ # 计算节点的重要性(访问频率的总和)
181
+ # node_importance = visit_matrix.sum(dim=1)
182
+ #
183
+ # # 使用贪心覆盖算法选择锚点
184
+ # anchor_indices = greedy_cover_with_importance(data, node_importance, coverage_radius, num_anchors)
185
+ # anchors = data[anchor_indices] # 提取锚点
186
+
187
+ # # Step 6: 可视化结果
188
+ # # from sklearn.decomposition import PCA
189
+ # # import matplotlib.pyplot as plt
190
+ # #
191
+ # # # 假设 data 和 anchors 是 5维张量
192
+ # # pca = PCA(n_components=2, random_state=42)
193
+ # #
194
+ # # # 降维到 2D
195
+ # # data_2d = pca.fit_transform(data.detach().cpu().numpy())
196
+ # # anchors_2d = pca.transform(anchors.detach().cpu().numpy())
197
+ # #
198
+ # # # 绘制统一显示的散点图
199
+ # # plt.figure(figsize=(8, 8))
200
+ # # plt.scatter(data_2d[:, 0], data_2d[:, 1], c='blue', label="Data Points", alpha=0.5, s=30)
201
+ # # plt.scatter(anchors_2d[:, 0], anchors_2d[:, 1], color="red", label="Anchor Points", s=100, edgecolor='black')
202
+ # # plt.title("Unified Visualization with PCA")
203
+ # # plt.legend()
204
+ # # plt.show()
205
+ #
206
+ #
207
+ #
208
+ # # 使用 UMAP 降维
209
+ # reducer = umap.UMAP(n_components=2)
210
+ # data_2d = reducer.fit_transform(data.detach().cpu().numpy())
211
+ # anchors_2d = reducer.transform(anchors.detach().cpu().numpy())
212
+ #
213
+ # # 绘制统一显示图
214
+ # plt.figure(figsize=(8, 8))
215
+ # plt.scatter(data_2d[:, 0], data_2d[:, 1], c='blue', label="Data Points", alpha=0.5, s=30)
216
+ # plt.scatter(anchors_2d[:, 0], anchors_2d[:, 1], color="red", label="Anchor Points", s=20, edgecolor='black')
217
+ # plt.title("Unified Visualization with UMAP")
218
+ # plt.legend()
219
+ # plt.show()
anchors.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from SRW_KNN_greedy import *
2
+ import torch
3
def get_anchors(h0, h1, map_pairs, num_unique_labels):
    """Pick anchor samples from both views and return their raw features.

    For each embedding matrix the random-walk parameters are derived with
    ``para``, a visit matrix is sampled with ``random_walk_batch_paths`` on
    the transition matrix from ``visit``, and the column sums of that visit
    matrix are used as importance scores for
    ``greedy_cover_with_importance``. The anchor indices of both views are
    merged and de-duplicated.

    Args:
        h0, h1: embeddings of view 0 / view 1, one row per sample.
        map_pairs: pair of array-likes holding the pre-embedding features;
            both are indexed with the merged anchor indices.
            NOTE(review): this presumes both views are row-aligned and of
            equal length - confirm against the caller.
        num_unique_labels: number of classes, forwarded to ``para``.

    Returns:
        ``(anchors0, anchors1, len_indices)``: float tensors with the anchor
        rows of each view and the number of distinct anchors.
    """
    # Parameters and transition matrices for both views come first so the
    # random walks consume the RNG in the same order as before (view 0, then 1).
    num_walks0, walk_length0, _, num_anchors0, coverage_radius0 = para(h0, h0.shape[0], num_unique_labels)
    num_walks1, walk_length1, _, num_anchors1, coverage_radius1 = para(h1, h1.shape[0], num_unique_labels)
    transition_matrix0, transition_matrix1 = visit(h0), visit(h1)

    visit_matrix0 = random_walk_batch_paths(transition_matrix0, num_walks0, walk_length0)
    visit_matrix1 = random_walk_batch_paths(transition_matrix1, num_walks1, walk_length1)

    # Importance of a node = how often it was visited from any start node.
    node_importance0 = visit_matrix0.sum(dim=0)
    node_importance1 = visit_matrix1.sum(dim=0)

    anchor_indices0 = greedy_cover_with_importance(h0, node_importance0, coverage_radius0, num_anchors0)
    anchor_indices1 = greedy_cover_with_importance(h1, node_importance1, coverage_radius1, num_anchors1)
    unique_indices = torch.unique(torch.cat((anchor_indices0, anchor_indices1)))
    len_indices = len(unique_indices)

    # as_tensor avoids the needless copy torch.tensor() makes when the input
    # already is a tensor.
    mapdata0, mapdata1 = torch.as_tensor(map_pairs[0]), torch.as_tensor(map_pairs[1])
    # The indices live on the embeddings' device while the raw features may
    # be on another one, so move the indices to each feature tensor's device
    # before indexing.
    anchors0 = mapdata0[unique_indices.to(mapdata0.device)].float()
    anchors1 = mapdata1[unique_indices.to(mapdata1.device)].float()
    return anchors0, anchors1, len_indices
config.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from easydict import EasyDict
2
# Global experiment configuration, exposed as attribute-style `config.<name>` entries.
# Exactly one dataset section is active; the others further down are commented out.
config = EasyDict()
# NOTE(review): `dim` is not referenced in the visible part of this file, and importing
# from `run` may be circular if `run` itself imports this module - confirm.
from run import dim
'''3Sources'''
# Input widths of the two views.
config.input_features1 =3560
config.input_features2 =3631
# Encoder layer widths; model.SdA starts its decoder from the last entry.
config.enhidden_features = [2000, 320, 50,6]
# Decoder layer widths (presumably one list per view, ending at that view's input width).
config.dehidden_features1 = [50, 320, 2000,3560]
config.dehidden_features2 = [50, 320, 2000,3631]
config.classes = 6
11
+ '''BBCsports'''
12
+ # config.input_features1 =2582
13
+ # config.input_features2 =2544
14
+ # config.enhidden_features = [1500, 200, 50,5]
15
+ # config.dehidden_features1 = [50, 200, 1500,2582]
16
+ # config.dehidden_features2 = [50, 200, 1500,2544]
17
+ # config.classes = 5
18
+ '''Caltech101'''
19
+ # config.input_features1 =1984
20
+ # config.input_features2 =512
21
+ # config.enhidden_features = [500, 320, 50,10]
22
+ # config.dehidden_features1 = [50, 320, 500,1984]
23
+ # config.dehidden_features2 = [50, 320, 500,512]
24
+ # config.classes = 20
25
+ '''ORL_mtv'''
26
+ # config.input_features1 =400
27
+ # config.input_features2 =400
28
+ # config.enhidden_features = [300, 150, 50,10]
29
+ # config.dehidden_features1 = [50, 150, 300,400]
30
+ # config.dehidden_features2 = [50, 150, 300,400]
31
+ # config.classes = 40
32
+ '''Caltech101_7'''
33
+ # config.input_features1 =1984
34
+ # config.input_features2 =512
35
+ # config.enhidden_features = [500, 320, 50,5]
36
+ # config.dehidden_features1 = [50, 320, 500,1984]
37
+ # config.dehidden_features2 = [50, 320, 500,512]
38
+ # config.classes = 7
39
+ '''scene15'''
40
+ # config.input_features1 =20
41
+ # config.input_features2 =59
42
+ # config.enhidden_features = [20, 15, 15,10]
43
+ # config.dehidden_features1 = [15, 15, 20,20]
44
+ # config.dehidden_features2 = [15, 15, 20,59]
45
+ # config.classes = 10
46
+ '''Prokaryotic'''
47
+ # config.input_features1 =393
48
+ # config.input_features2 =438
49
+ # config.enhidden_features = [300, 150, 50,10]
50
+ # config.dehidden_features1 = [50, 150, 300,393]
51
+ # config.dehidden_features2 = [50, 150, 300,438]
52
+ # config.classes = 4
53
+ '''yale_mtv'''
54
+ # config.input_features1 =4096
55
+ # config.input_features2 =3304
56
+ # config.enhidden_features = [1500, 200, 50,5]
57
+ # config.dehidden_features1 = [50, 200, 1500,4096]
58
+ # config.dehidden_features2 = [50, 200, 1500,3304]
59
+ # config.classes = 15
60
+ '''flower17'''
61
+ # config.input_features1 =1360
62
+ # config.input_features2 =1360
63
+ # config.enhidden_features = [1000, 200, 50,5]
64
+ # config.dehidden_features1 = [50, 200, 1000,1360]
65
+ # config.dehidden_features2 = [50, 200, 1000,1360]
66
+ # config.classes = 17
67
+ '''100leaves'''
68
+ # config.input_features1 =64
69
+ # config.input_features2 =64
70
+ # config.enhidden_features = [200, 200, 50,10]
71
+ # config.dehidden_features1 = [50, 200, 200,64]
72
+ # config.dehidden_features2 = [50, 200, 200,64]
73
+ # config.classes = 100
74
+
75
# Optimisation hyperparameters.
config.lr = 1e-3
config.momentum = 0.9  # only used by SGD: momentum takes a weighted average of past gradients to steer the current step and damp oscillation
config.weight_decay = 0
config.w_v = 0

# Logging / schedule settings.
config.print_step = 10
config.tensorboard_step = 100
config.load_iter = 0
config.train_iters = 5000
config.is_train = True
config.use_cuda = True
data_loader.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mat73
2
+ import numpy as np
3
+ import scipy.io as sio
4
+ import torch
5
+ import random
6
+ from torch.utils.data import Dataset, DataLoader
7
+ from utils import *
8
def get_pairs(E_X, E_Y, neg_prop, train_label):
    """Build positive and randomly sampled negative pairs from two aligned views.

    Positive pairs are ``(E_X[i], E_Y[i])``. For every sample of ``E_X``,
    ``neg_prop`` distinct samples of ``E_Y`` are drawn with ``random.sample``
    and paired with it as negatives. Because the draw ignores the classes,
    some "negative" pairs share a class (noisy labels); ``real_labels``
    records the truth: 1 when both members have the same class, else 0.

    Returns:
        ``(view0, view1, labels, real_labels, class_labels0, class_labels1)``
        as numpy arrays (float32 for the views, int64 for the rest).
    """
    n_samples = len(E_X)

    # Positive pairs: same index in both views.
    view0 = [E_X[i] for i in range(n_samples)]
    view1 = [E_Y[i] for i in range(n_samples)]
    labels = [1] * n_samples
    real_labels = [1] * n_samples
    class_labels0 = [train_label[i] for i in range(n_samples)]
    class_labels1 = list(class_labels0)

    # Negative pairs: each view-0 sample against random view-1 samples.
    for anchor in range(n_samples):
        for partner in random.sample(range(len(E_Y)), neg_prop):
            view0.append(E_X[anchor])
            view1.append(E_Y[partner])
            labels.append(0)
            class_labels0.append(train_label[anchor])
            class_labels1.append(train_label[partner])
            same_class = train_label[anchor] == train_label[partner]
            real_labels.append(1 if same_class else 0)

    return (
        np.array(view0, dtype=np.float32),
        np.array(view1, dtype=np.float32),
        np.array(labels, dtype=np.int64),
        np.array(real_labels, dtype=np.int64),
        np.array(class_labels0, dtype=np.int64),
        np.array(class_labels1, dtype=np.int64),
    )
model.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from torch import optim
3
+
4
class endA(nn.Module):
    """One encoder stage: Linear -> BatchNorm1d -> ReLU -> Dropout(0.1)."""

    def __init__(self, in_features, out_features):
        super().__init__()
        stages = [
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(True),
            nn.Dropout(0.1),
        ]
        # Kept under the attribute name ``encoder`` so parameter names stay
        # ``encoder.<idx>.*``.
        self.encoder = nn.Sequential(*stages)

    def forward(self, x):
        """Return the encoded representation of ``x``."""
        return self.encoder(x)
20
+
21
+
22
class dedA(nn.Module):
    """One decoder stage: Linear followed by an in-place ReLU.

    Note the parameter order: the first argument is the width of the incoming
    (latent) tensor, the second the width this stage produces.
    """

    def __init__(self, out_features, in_features):
        super().__init__()
        projection = nn.Linear(out_features, in_features)
        activation = nn.ReLU(True)
        # Kept under the attribute name ``decoder`` so parameter names stay
        # ``decoder.<idx>.*``.
        self.decoder = nn.Sequential(projection, activation)

    def forward(self, x):
        """Return the decoded representation of ``x``."""
        return self.decoder(x)
36
+
37
class SdA(nn.Module):
    """Two-view stacked autoencoder.

    Each view gets its own encoder stack (``endA`` stages sized by
    ``config.enhidden_features``) and decoder stack (``dedA`` stages sized by
    ``config.dehidden_features1`` / ``config.dehidden_features2``).

    Attributes:
        layers1, layers2: encoder / decoder stack of view 1.
        layers3, layers4: encoder / decoder stack of view 2.
        layerll1, layerll2: encoder + decoder chained per view (the attribute
            names are kept as they are for checkpoint compatibility).
        ce_criterion, da_optimizers: only created when ``config.is_train``.
            ``da_optimizers`` holds one Adam optimizer per encoder stage
            (except the last stage of each stack) for BOTH views.
    """

    def __init__(self, config):
        super().__init__()

        # ---- view 1 -------------------------------------------------------
        self.layers1, self.layers2 = self._build_view(
            config.input_features1, config.enhidden_features, config.dehidden_features1)
        self.layerll1 = nn.Sequential(self.layers1, self.layers2)

        if config.is_train:
            self.ce_criterion = nn.CrossEntropyLoss()
            self.da_optimizers = self._layer_optimizers(self.layers1)

        # ---- view 2 -------------------------------------------------------
        self.layers3, self.layers4 = self._build_view(
            config.input_features2, config.enhidden_features, config.dehidden_features2)
        self.layerll2 = nn.Sequential(self.layers3, self.layers4)

        if config.is_train:
            # Fix: extend the list instead of re-creating it, otherwise the
            # optimizers built for view 1 above are silently thrown away.
            self.da_optimizers.extend(self._layer_optimizers(self.layers3))

    @staticmethod
    def _build_view(in_features, en_hidden, de_hidden):
        """Create the (encoder, decoder) ``nn.Sequential`` pair of one view."""
        encoder_stages = []
        for out_features in en_hidden:
            encoder_stages.append(endA(in_features, out_features))
            in_features = out_features

        decoder_stages = []
        in_features = en_hidden[-1]
        for out_features in de_hidden:
            decoder_stages.append(dedA(in_features, out_features))
            in_features = out_features

        return nn.Sequential(*encoder_stages), nn.Sequential(*decoder_stages)

    @staticmethod
    def _layer_optimizers(encoder):
        """Return one Adam optimizer per encoder stage, skipping the last stage.

        NOTE(review): the last stage is skipped exactly as in the original
        code; the reason is not visible here.
        """
        return [
            optim.Adam(stage.parameters(), lr=0.001, betas=(0.9, 0.99), eps=1e-8, weight_decay=0)
            for stage in encoder[:-1]
        ]

    def forward(self, x1, x2):
        """Encode and reconstruct both views.

        Returns:
            ``(h1, h2, h3, h4)``: latent code of view 1, latent code of
            view 2, reconstruction of view 1, reconstruction of view 2.
        """
        h1 = self.layers1(x1)
        h3 = self.layers2(h1)
        h2 = self.layers3(x2)
        h4 = self.layers4(h2)
        return h1, h2, h3, h4

    def regularization_loss(self):
        """Return ``0.001 *`` the sum of squares of all model parameters (L2 penalty)."""
        l2_lambda = 0.001
        l2_norm = sum(p.pow(2).sum() for p in self.parameters())
        return l2_lambda * l2_norm
130
+
131
+
132
class Anchormodel(nn.Module):
    """Pair of independent MLP encoders, one per view.

    Both encoders share the same architecture: three hidden blocks of
    ``Linear(…, 1024) -> BatchNorm1d -> ReLU -> Dropout(0.2)`` followed by
    ``Linear(1024, outfeature) -> BatchNorm1d -> ReLU``.
    """

    def __init__(self,dim,outfeature):
        super().__init__()
        # Built in this order (encoder0 first) so that parameter
        # initialisation consumes the RNG in the same sequence.
        self.encoder0 = self._make_encoder(dim, outfeature)
        self.encoder1 = self._make_encoder(dim, outfeature)

    @staticmethod
    def _make_encoder(dim, outfeature):
        """Assemble one encoder; module indices match a hand-written Sequential."""
        widths = [dim, 1024, 1024, 1024]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.extend([
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(True),
                nn.Dropout(0.2),
            ])
        modules.extend([
            nn.Linear(1024, outfeature),
            nn.BatchNorm1d(outfeature),
            nn.ReLU(True),
        ])
        return nn.Sequential(*modules)

    def forward(self, x0, x1):
        """Flatten each input to ``(batch, -1)`` and encode it with its own encoder."""
        flat0 = x0.view(x0.size()[0], -1)
        flat1 = x1.view(x1.size()[0], -1)
        h0 = self.encoder0(flat0)
        h1 = self.encoder1(flat1)
        return h0, h1
run.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import time
3
+ import random
4
+ from model import *
5
+ import math
6
+ import torch,gc
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from train_methods import *
10
+ import logging
11
+ import sys
12
+ import numpy as np
13
+ import matplotlib.pyplot as plt
14
+ from Datasets import *
15
+ from config import *
16
+ from data_loader import *
17
+ import mat73
18
+ from anchors import *
19
+ from Cluster import *
20
def _str2bool(value):
    """Parse a command-line boolean.

    ``argparse`` with ``type=bool`` calls ``bool("False")``, which is ``True``
    for every non-empty string.  This converter maps the usual "false"
    spellings (and the empty string) to ``False``; any other non-empty value
    stays ``True``, so previously accepted inputs keep working.
    """
    if isinstance(value, bool):
        return value
    return value.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n', 'off')


parser = argparse.ArgumentParser(description='CAPIMAC in PyTorch')
parser.add_argument('--data', default='1', type=int,
                    help='choice of dataset, 0-HW,1-3Sources,2BBC,3-Scene15, 4-Caltech101,5-ORL_mtv,6-Caltech_7,7-Reuters,'
                         '8-20newsgroups,9-100leaves,10-BBC4,11-MSRCv1,12-BDGP,13-HandWritten,14-yale_mtv,15-Wikipedia-test,16-Movies,17-Prokaryotic,18-ALOI,19-flower17')
parser.add_argument('-bs', '--batch-size', default='1024', type=int, help='number of batch size')
parser.add_argument('-e', '--epochs', default='200', type=int, help='number of epochs to run')
parser.add_argument('-lr', '--learn-rate', default='0.0001', type=float, help='learning rate of adam')
parser.add_argument('-ap', '--aligned-prop', default='0.5', type=float,
                    help='originally aligned proportions in the partially view-aligned data')
parser.add_argument('--gpu', default=0, type=int, help='GPU device idx to use.')
parser.add_argument('-cp', '--complete-prop', default='0.5', type=float,
                    help='originally complete proportions in the partially sample-missing data')
parser.add_argument('-m', '--margin', default='5', type=int, help='initial margin')
# Boolean flags use _str2bool so that e.g. ``-s False`` really disables the option.
parser.add_argument('-s', '--start-fine', default=True, type=_str2bool, help='flag to start use robust loss or not')
parser.add_argument('-np', '--neg-num', default='30', type=int, help='the ratio of negative to positive pairs')
parser.add_argument('-noise', '--noisy-training', type=_str2bool, default=True,
                    help='training with real labels or noisy labels')
parser.add_argument('-r', '--robust', default=1, type=int, help='use our robust loss or not')

# Module-level feature dimension; load_data() overwrites it via ``global dim``.
dim=0
40
class NoiseRobustLoss(nn.Module):
    """Contrastive pair loss with an optional noise-robust negative term."""

    def __init__(self):
        super().__init__()

    def forward(self, pair_dist, P, margin, use_robust_loss, args):
        """Compute the mean pair loss.

        Args:
            pair_dist: distance of every pair.
            P: pair labels (1 = positive, 0 = negative); cast to float32.
            margin: margin used by the negative-pair term.
            use_robust_loss: the robust negative term is used only when this
                equals 1 and ``args.start_fine`` is truthy.
            args: namespace providing ``start_fine``.

        Returns:
            ``sum(loss) / (2 * number_of_pairs)``.
        """
        positive = P.to(torch.float32)
        n_pairs = len(P)

        # Positive pairs are pulled together by the squared distance.
        attract = positive * (pair_dist * pair_dist)

        if use_robust_loss == 1 and args.start_fine:
            # Robust variant: sqrt(d) * (margin / 2 - d), clamped at zero.
            hinge = torch.clamp(torch.pow(pair_dist, 0.5) * (0.5*margin - pair_dist), min=0.0)
            repel = (1 - positive) * (1 / margin) * torch.pow(hinge, 2)
        else:
            # Standard contrastive hinge on negatives.
            repel = (1 - positive) * torch.pow(torch.clamp(margin - pair_dist, min=0.0), 2)

        return torch.sum(attract + repel) / (2.0 * n_pairs)
59
def _read_views(dataset):
    """Load the two feature views and the ground-truth labels of ``dataset``.

    Reads ``./datasets/<dataset>.mat`` (via ``mat73`` for ``Caltech101_7``,
    ``scipy.io`` otherwise) and picks the two views / label field used for
    that dataset.

    Raises:
        ValueError: if ``dataset`` is not one of the supported names.
    """
    path = './datasets/' + dataset + '.mat'
    if dataset=='Caltech101_7':
        mat = mat73.loadmat(path)
    else:
        mat = sio.loadmat(path)

    if dataset in ('Scene15', '3Sources', 'BBCsports', 'Caltech101', 'Reuters', 'flower17'):
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset in ('HandWritten', 'MSRCv1'):
        data = mat['X'][0][1:3]
        label = np.squeeze(mat['Y'])
    elif dataset in ('ALOI', 'ORL_mtv'):
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['gt'])
    elif dataset == 'Reuters_dim10':
        # Train and test splits are stacked, then min-max normalised per view.
        data = []
        data.append(normalize(np.vstack((mat['x_train'][0], mat['x_test'][0]))))
        data.append(normalize(np.vstack((mat['x_train'][1], mat['x_test'][1]))))
        label = np.squeeze(np.hstack((mat['y_train'], mat['y_test'])))
    elif dataset == 'Caltech101_7':
        data = mat['data'][3:5]
        data[0], data[1] = np.squeeze(data[0]), np.squeeze(data[1])
        data[0], data[1] = np.array(data[0]), np.array(data[1])
        label = np.squeeze(mat['labels'])
    elif dataset == '20NewsGroups':
        data = mat['data'][0][1:3]
        label = np.squeeze(mat['truelabel'][0][0])
    elif dataset == '100leaves':
        mat['data'][0][0], mat['data'][0][1] = mat['data'][0][0].T, mat['data'][0][1].T
        data = mat['data'][0][0:2]
        label = np.squeeze(mat['truelabel'][0][0])
    elif dataset == 'BBC4':
        data = mat['data'][0][0:2]
        label = np.squeeze(mat['truelabel'][0][0])
    elif dataset in ('BDGP', 'yale_mtv'):
        mat['X'][0][0], mat['X'][0][1] = mat['X'][0][0].T, mat['X'][0][1].T
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['gt'])
    elif dataset in ('Wikipedia-test', 'Movies'):
        data = mat['X'][0:2][0:2]
        data = np.squeeze(data.T)
        label = np.squeeze(mat['y'])
    elif dataset == 'Prokaryotic':
        data = [mat['X'][0][0], mat['X'][2][0]]
        label = np.squeeze(mat['y'])
    else:
        raise ValueError('unsupported dataset: {}'.format(dataset))
    return data, label


def load_data(align_prop,complete_prop,neg_num,is_noise,dataset):
    """Load a two-view dataset and build the training pairs and evaluation data.

    Steps: seed the RNGs, read the two views, split samples into an aligned
    (train) and an unaligned (test) part, pre-train the autoencoder on the
    aligned part, shuffle / mask the unaligned part, build positive and
    negative training pairs, derive anchors and re-represent all data
    relative to those anchors.

    Args:
        align_prop: proportion of samples whose views are aligned.
        complete_prop: proportion of complete samples in the unaligned part.
        neg_num: number of negative pairs sampled per positive pair.
        is_noise: ``0`` trains with the real pair labels, anything else with
            the constructed (noisy) ones.
        dataset: dataset name, see ``_read_views``.

    Returns:
        ``(train_pairs, train_pair_labels, train_pair_real_labels, all_data,
        all_label, all_label_X, all_label_Y, dim, num_unique_labels,
        divide_seed)``.
    """
    global dim
    NetSeed = random.randint(1, 1000)
    print(NetSeed)
    np.random.seed(NetSeed)
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(NetSeed)  # seed the CPU generator
    torch.cuda.manual_seed(NetSeed)  # seed the current GPU generator
    args = parser.parse_args()
    all_data = []
    map_pairs = []
    train_pairs = []

    data, label = _read_views(dataset)

    divide_seed = random.randint(1, 1000)
    train_idx, test_idx = TT_split(len(label), 1 - align_prop, divide_seed)
    train_label, test_label = label[train_idx], label[test_idx]
    if dataset == 'Caltech101_7':
        data[0], data[1] = np.squeeze(data[0]), np.squeeze(data[1])
    print(np.shape(data[0]))
    train_X, train_Y, test_X, test_Y = data[0][train_idx], data[1][train_idx], data[0][test_idx], data[1][test_idx]

    # Latent representation of the aligned part (autoencoder pre-training).
    map_pairs.append(train_X)
    map_pairs.append(train_Y)
    h0, h1, _ = pretrain(map_pairs, args)

    if align_prop != 1:
        # Break the alignment of the test part by shuffling view 1.
        shuffle_idx = random.sample(range(len(test_Y)), len(test_Y))
        test_Y = test_Y[shuffle_idx]
        test_label_X, test_label_Y = test_label, test_label[shuffle_idx]

        # Incomplete part: drop samples per view according to the mask.
        test_mask = get_sn(2, len(test_label), 1 - complete_prop)
        X_mask, Y_mask = test_mask[:, 0].astype(np.bool_), test_mask[:, 1].astype(np.bool_)
        test_X, test_Y = test_X[X_mask], test_Y[Y_mask]
        test_label_X, test_label_Y = test_label_X[X_mask], test_label_Y[Y_mask]

        all_label_X = np.concatenate((train_label, test_label_X))
        all_label_Y = np.concatenate((train_label, test_label_Y))
        all_data.append(np.concatenate((train_X, test_X)).T)
        all_data.append(np.concatenate((train_Y, test_Y)).T)
        all_label = np.concatenate((train_label, test_label))
    else:
        # Fully aligned data: there is no unaligned part to shuffle or mask.
        # (Previously the masking step ran here as well and raised NameError
        # because test_label_X / test_label_Y were never defined.)
        all_data.append(train_X.T)
        all_data.append(train_Y.T)
        all_label_X, all_label_Y = train_label, train_label
        all_label = train_label

    # Build the training pairs.
    view0, view1, noisy_labels, real_labels, _, _ = get_pairs(train_X, train_Y, neg_num, train_label)
    count = int(np.count_nonzero(noisy_labels != real_labels))
    print('noise rate of the constructed neg. pairs is ', round(count / (len(noisy_labels) - len(train_X)), 2))

    if is_noise == 0:  # training with real_labels, v/t with real_labels
        print("----------------------Training with real_labels----------------------")
        train_pair_labels = real_labels
    else:  # training with labels, v/t with real_labels
        print("----------------------Training with noisy_labels----------------------")
        train_pair_labels = noisy_labels

    # Initialise the anchors (h0 / h1 are tensors returned by pretrain).
    num_unique_labels = np.unique(all_label).shape[0]
    anchors0, anchors1, len_indices = get_anchors(h0, h1, map_pairs, num_unique_labels)

    # Re-represent every sample relative to the anchors.
    view0, view1 = torch.from_numpy(view0).float(), torch.from_numpy(view1).float()
    all_data[0], all_data[1] = torch.from_numpy(all_data[0]).float(), torch.from_numpy(all_data[1]).float()

    view0, view1 = find_nanchor(anchors0, view0), find_nanchor(anchors1, view1)
    all_data[0], all_data[1] = find_nanchor(anchors0, all_data[0].T), find_nanchor(anchors1, all_data[1].T)
    # Result layout: (number of anchors) x (number of samples).
    view0, view1, all_data[0], all_data[1] = np.array(view0), np.array(view1), np.array(all_data[0]), np.array(all_data[1])
    print(np.shape(view0),'view0')
    train_pairs.append(view0)
    train_pairs.append(view1)
    train_pair_real_labels = real_labels
    dim = view0.shape[0]
    return train_pairs, train_pair_labels, train_pair_real_labels, all_data, all_label, all_label_X, all_label_Y, dim,num_unique_labels,divide_seed
230
+
231
def normalize(x):
    """Min-max scale every column of a 2-D array to the range [0, 1].

    Args:
        x: array of shape ``(n_samples, n_features)``.

    Returns:
        Array of the same shape with each column mapped to ``[0, 1]``.
        A constant column (max == min) maps to all zeros instead of the
        NaN values a plain ``0 / 0`` division would produce.
    """
    col_min = np.min(x, axis=0)
    col_range = np.max(x, axis=0) - col_min
    # Guard against division by zero for constant features.
    safe_range = np.where(col_range == 0, 1, col_range)
    # Broadcasting over rows replaces the explicit np.tile copies.
    return (x - col_min) / safe_range
235
def loader(train_bs, align_prop, complete_prop,neg_num, is_noise, dataset):
    """Prepare the shuffled pair loader plus the evaluation data.

    :param train_bs: batch size of the training pair loader
    :param align_prop: proportion of originally aligned samples
    :param complete_prop: proportion of originally complete samples
    :param neg_num: number of negative pairs per positive pair
    :param is_noise: 0 trains with real pair labels, otherwise noisy labels
    :param dataset: name of the dataset to load
    :return: ``(train_pair_loader, all_data, all_label, all_label_X,
        all_label_Y, dim, num_unique_labels, divide_seed)``; the loader
        yields the constructed positive / negative pairs and drops the last
        incomplete batch.
    """
    (pairs, pair_labels, pair_real_labels,
     all_data, all_label, all_label_X, all_label_Y,
     dim, num_unique_labels, divide_seed) = load_data(
        align_prop, complete_prop, neg_num, is_noise, dataset)

    pair_dataset = GetDataset(pairs, pair_labels, pair_real_labels)
    train_pair_loader = DataLoader(pair_dataset, batch_size=train_bs, shuffle=True, drop_last=True)

    return train_pair_loader, all_data, all_label, all_label_X, all_label_Y, dim,num_unique_labels,divide_seed
255
+
256
if __name__ == '__main__':
    # Script entry point: load data, train the anchor model, evaluate the
    # clustering after every epoch and report the best value of each metric.
    for _run in range(1):
        args = parser.parse_args()
        data_name = ['HandWritten', '3Sources', 'BBCsports', 'Scene15', 'Caltech101', 'ORL_mtv', 'Caltech101_7', 'Reuters',
                     '20NewsGroups','100leaves','BBC4','MSRCv1','BDGP','HandWritten','yale_mtv','Wikipedia-test','Movies','Prokaryotic','ALOI','flower17']
        train_pair_loader, all_data, all_label, all_label_X, all_label_Y, dim, outfeature ,divide_seed=loader(args.batch_size, args.aligned_prop,args.complete_prop,args.neg_num,args.noisy_training,data_name[args.data])

        model = Anchormodel(dim,outfeature).to(args.gpu)
        criterion = NoiseRobustLoss().to(args.gpu)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learn_rate)
        CAR_list = []
        acc_list, nmi_list, ari_list,f_list,f1_list,pre_list,pre2_list,rec_list,pur_list = [], [], [],[], [], [],[], [], []
        train_time = 0
        all_data[0], all_data[1]=torch.from_numpy(all_data[0]), torch.from_numpy(all_data[1])
        # ``epoch`` (not ``i``) so the outer loop variable is not shadowed.
        for epoch in range(0, args.epochs + 1):
            if epoch == 0:
                # Epoch 0 only evaluates the untrained model; train2 skips the
                # optimizer step for it, so gradients are not needed.
                with torch.no_grad():
                    epoch_time = train2(train_pair_loader, model, criterion, optimizer, epoch, args)
            else:
                epoch_time = train2(train_pair_loader, model, criterion, optimizer, epoch, args)
            # Fix: accumulate the time, otherwise the final report is always 0 s.
            train_time += epoch_time

            # test
            v0, v1, pred_label, alignment_rate = tiny_infer(model, args.gpu, all_data, all_label_X, all_label_Y)
            CAR_list.append(alignment_rate)
            data = [v0, v1]

            y_pred, ret, accuracy, nmi, ari, f_score, f_score2, precision, precision2, recall, purity = Clustering(data,
                                                                                                                  pred_label)
            if epoch % 10 == 0:
                print(accuracy, nmi, ari, f_score, f_score2, precision, precision2, recall, purity)
            acc_list.append(ret['kmeans']['ACC'])
            nmi_list.append(ret['kmeans']['NMI'])
            ari_list.append(ret['kmeans']['ARI'])
            f_list.append(ret['kmeans']['F1'])
            f1_list.append(ret['kmeans']['F2'])
            pre_list.append(ret['kmeans']['PRE'])
            pre2_list.append(ret['kmeans']['PRE2'])
            rec_list.append(ret['kmeans']['REC'])
            pur_list.append(ret['kmeans']['PUR'])
        # Best value of every metric over all evaluated epochs.
        print('ACC:', max(acc_list))
        print("NMI:", max(nmi_list))
        print("ARI:", max(ari_list))
        print("F1:", max(f_list))
        print("F2:", max(f1_list))
        print("PRE:", max(pre_list))
        print("PRE2:", max(pre2_list))
        print("REC:", max(rec_list))
        print("PUR:", max(pur_list))
        # NOTE(review): no logging configuration is visible in this script, so
        # this INFO record may not be shown unless logging is configured elsewhere.
        logging.info('******** End, training time = {} s ********'.format(round(train_time, 2)))
sample_kernal.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from sklearn.metrics.pairwise import cosine_similarity
3
+
4
+ # 高斯核函数
5
def gaussian_kernel(x, x_i, bandwidth):
    """Return the unnormalised Gaussian kernel ``exp(-0.5 * ((x - x_i) / bandwidth)**2)``."""
    scaled_offset = (x - x_i) / bandwidth
    return np.exp(-0.5 * scaled_offset ** 2)
7
+
8
+
9
+ # 核回归插值函数(支持多维)
10
def kernel_regression_multi_dim(x_known, y_known, x_targets, bandwidth):
    """Interpolate multi-dimensional values with Gaussian kernel regression.

    x_known: x coordinates of the known points (1D array)
    y_known: values at the known points (2D array, shape: [n_samples, n_features])
    x_targets: iterable of x coordinates to interpolate at
    bandwidth: bandwidth of the Gaussian kernel

    Returns an array with one interpolated row per entry of ``x_targets``.
    """

    def _interpolate(x_target):
        # Gaussian kernel weight of every known point for this target.
        weights = np.array(
            [np.exp(-0.5 * ((x_target - x_i) / bandwidth) ** 2) for x_i in x_known]
        )
        weights /= weights.sum()  # normalise the weights
        # Weighted average, computed for all feature dimensions at once.
        return np.sum(weights[:, np.newaxis] * y_known, axis=0)

    return np.array([_interpolate(x_target) for x_target in x_targets])
30
+
31
+
32
def insert_and_sort(x_known, y_known, x_targets, y_targets):
    """Merge known and target points and return them ordered by x.

    The x arrays are concatenated, the y arrays stacked row-wise, and both
    are reordered with the argsort of the merged x coordinates.

    Returns:
        ``(x_sorted, y_sorted)``.
    """
    merged_x = np.concatenate((x_known, x_targets))
    merged_y = np.vstack((y_known, y_targets))

    order = np.argsort(merged_x)
    return merged_x[order], merged_y[order]
train_methods.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ from model import SdA
4
+ from config import *
5
+ import torch.nn as nn
6
+ import torch
7
+ import time
8
+ import logging
9
+ import torch.nn.functional as F
10
def train1(train_pairs, model, criterion, optimizer, epoch, args):
    """Run one full-batch autoencoder training step on the aligned pairs.

    Args:
        train_pairs: ``[view0, view1]`` as NumPy arrays.
        model: module returning ``(h0, h1, d0, d1)`` (latent codes and
            reconstructions) and providing ``regularization_loss()``.
        criterion: reconstruction loss, called as ``criterion(x, d)``.
        optimizer: optimizer over the model parameters.
        epoch: current epoch; epoch 0 only evaluates (no optimizer step).
        args: namespace providing ``gpu`` and ``epochs``.

    Returns:
        ``(h0, h1, epoch_time)``: the two latent codes and the elapsed time
        in seconds.

    Raises:
        Exception: any error raised by the model's forward pass is reported
            and re-raised (it used to be swallowed, which surfaced later as
            an unrelated UnboundLocalError).
    """
    if epoch % 10 == 0:
        logging.info("=======> Train epoch: {}/{}".format(epoch, args.epochs))
    model.train()
    time0 = time.time()
    x0 = torch.from_numpy(train_pairs[0]).float().to(args.gpu)
    x1 = torch.from_numpy(train_pairs[1]).float().to(args.gpu)
    try:
        h0, h1, d0, d1 = model(x0, x1)
    except Exception:
        print("error raise in batch",epoch)
        raise

    # Reconstruction error of both views plus the L2 weight penalty.
    loss = criterion(x0, d0)
    loss += criterion(x1, d1)
    loss += model.regularization_loss()
    if epoch != 0:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    epoch_time = time.time() - time0

    return h0 , h1,epoch_time
36
def pretrain(train_pairs, args):
    """Pre-train a fresh ``SdA`` autoencoder on the aligned pairs.

    Runs ``args.epochs + 1`` calls of ``train1``; epoch 0 is executed under
    ``torch.no_grad()``.

    Returns:
        ``(h0, h1, epoch_time)`` as returned by the last ``train1`` call.
    """
    model = SdA(config).to(args.gpu)
    criterion = nn.MSELoss().to(args.gpu)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learn_rate)

    for epoch in range(args.epochs + 1):
        if epoch:
            result = train1(train_pairs, model, criterion, optimizer, epoch, args)
        else:
            # Epoch 0 performs no parameter update, so no graph is needed.
            with torch.no_grad():
                result = train1(train_pairs, model, criterion, optimizer, epoch, args)

    h0, h1, epoch_time = result
    return h0, h1, epoch_time
48
+
49
def train2(train_loader, model, criterion,optimizer, epoch, args):
    """Train the anchor model for one epoch on the constructed pairs.

    Args:
        train_loader: iterable of ``(x0, x1, labels, real_labels)`` batches;
            ``labels`` are the (possibly noisy) pair labels used for the
            loss, ``real_labels`` the clean ones (not used here).
        model: module returning ``(h0, h1)`` for a pair of flattened inputs.
        criterion: called as ``criterion(pair_dist, labels, args.margin,
            args.robust, args)``.
        optimizer: optimizer over the model parameters.
        epoch: current epoch; epoch 0 only evaluates (no optimizer step).
        args: namespace providing ``gpu``, ``margin`` and ``robust``.

    Returns:
        Elapsed time of the epoch in seconds.

    Raises:
        Exception: any error raised by the model's forward pass is reported
            and re-raised (it used to be swallowed, which surfaced later as
            an unrelated UnboundLocalError).
    """
    model.train()
    time0 = time.time()
    for batch_idx, (x0, x1, labels, real_labels) in enumerate(train_loader):
        x0, x1, labels = x0.to(args.gpu), x1.to(args.gpu), labels.to(args.gpu)
        print(np.shape(x0))
        try:
            h0, h1 = model(x0.view(x0.size()[0], -1), x1.view(x1.size()[0], -1))
        except Exception:
            print("error raise in batch", batch_idx)
            raise

        pair_dist = F.pairwise_distance(h0, h1)

        loss = criterion(pair_dist, labels, args.margin, args.robust, args)
        if epoch != 0:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    epoch_time = time.time() - time0
    return epoch_time
utils.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import random
3
+ from sklearn.preprocessing import OneHotEncoder
4
+ from numpy.random import randint
5
+ import math
6
+ import torch
7
def TT_split(n_all, test_prop, seed):
    """Randomly split ``range(n_all)`` into training and testing indices.

    Args:
        n_all: total number of samples.
        test_prop: proportion of samples assigned to the test part.
        seed: seed passed to ``random.seed`` (this reseeds the global
            ``random`` generator).

    Returns:
        ``(train_idx, test_idx)`` as lists of indices.  The first
        ``ceil((1 - test_prop) * n_all)`` shuffled indices form the training
        part and the last ``floor(test_prop * n_all)`` the test part; the
        test part is empty when that count is 0.
    """
    random.seed(seed)
    random_idx = random.sample(range(n_all), n_all)
    train_num = int(np.ceil((1-test_prop) * n_all))
    test_num = int(np.floor(test_prop * n_all))
    train_idx = random_idx[0:train_num]
    # Slice from an explicit start: ``random_idx[-0:]`` would return the
    # whole list instead of an empty one when test_num == 0.
    test_idx = random_idx[max(n_all - test_num, 0):]
    return train_idx, test_idx
18
+
19
def get_sn(view_num, alldata_len, missing_rate):
    """Randomly generate incomplete data information, simulate partial view data with complete view data

    The result is a 0/1 matrix in which entry ``[i, v]`` marks whether view
    ``v`` of sample ``i`` is kept.  Every row contains at least one 1
    because a one-hot "preserved view" is always OR-ed in.

    :param view_num:view number
    :param alldata_len:number of samples
    :param missing_rate:Defined in section 4.3 of the paper
    :return:Sn, presumably of shape (alldata_len, view_num) — the one-hot
        branch relies on every view index being drawn at least once;
        TODO confirm for very small ``alldata_len``
    """
    # NOTE(review): the requested rate is halved before use; the reason is
    # not visible in this file.
    missing_rate = missing_rate / 2
    one_rate = 1.0 - missing_rate
    # Keep-rate at or below 1/view_num: every sample keeps exactly one
    # randomly chosen view.
    if one_rate <= (1 / view_num):
        enc = OneHotEncoder() # n_values=view_num
        view_preserve = enc.fit_transform(randint(0, view_num, size=(alldata_len, 1))).toarray()
        return view_preserve
    error = 1
    # Nothing is missing: randint(1, 2) can only produce 1, i.e. all ones.
    if one_rate == 1:
        matrix = randint(1, 2, size=(alldata_len, view_num))
        return matrix
    max_iterations = 200 # upper bound on the number of resampling rounds
    iterations = 0 # rounds performed so far

    # Resample until the realised keep-rate is within 0.005 of the target or
    # the iteration budget is exhausted.
    while error >= 0.005 and iterations < max_iterations:
        enc = OneHotEncoder() # n_values=view_num
        # One-hot matrix with exactly one 1 per row: the view that is always kept.
        view_preserve = enc.fit_transform(randint(0, view_num, size=(alldata_len, 1))).toarray()
        # Number of additional ones needed on top of the preserved views.
        one_num = view_num * alldata_len * one_rate - alldata_len
        ratio = one_num / (view_num * alldata_len)  # NOTE(review): original annotation here was "0.25", presumably an example value
        matrix_iter = (randint(0, 100, size=(alldata_len, view_num)) < int(ratio * 100)).astype(int)
        # Count of random ones that collide with an already preserved entry.
        a = np.sum(((matrix_iter + view_preserve) > 1).astype(int))

        # Inflate the target to compensate for those collisions, then redraw.
        one_num_iter = one_num / (1 - a / one_num)
        ratio = one_num_iter / (view_num * alldata_len)
        matrix_iter = (randint(0, 100, size=(alldata_len, view_num)) < int(ratio * 100)).astype(int)
        matrix = ((matrix_iter + view_preserve) > 0).astype(int)
        ratio = np.sum(matrix) / (view_num * alldata_len)
        error = abs(one_rate - ratio)
        iterations=iterations+1
    return matrix
55
+
56
def cosineSimilartydis(A,B):
    """Return a column-wise min-max normalised cosine distance matrix.

    Rows of ``A`` and ``B`` are L2-normalised (with a 1e-6 stabiliser), the
    cosine similarity ``A @ B.T`` is computed, every column is rescaled to
    [0, 1] by its own minimum and maximum (the range is clamped to at least
    1e-6), and ``1 - similarity`` is returned.  Result shape:
    ``(A.shape[0], B.shape[0])``.
    """
    unit_a = A / (torch.norm(A, dim=1, p=2, keepdim=True) + 0.000001)
    unit_b = B / (torch.norm(B, dim=1, p=2, keepdim=True) + 0.000001)

    similarity = torch.mm(unit_a, unit_b.t())
    col_max = similarity.max(dim=0).values
    col_min = similarity.min(dim=0).values
    # Avoid dividing by zero for columns whose entries are all equal.
    span = torch.clamp(col_max - col_min, min=1e-6)
    rescaled = (similarity - col_min) / span
    return 1 - rescaled
67
+
68
def find_nanchor(A,B):
    """Return the anchor-to-sample distance matrix with the nearest anchors zeroed.

    Computes ``cosineSimilartydis(A, B)`` and, for every column, sets the
    ``n = ceil(rows / 19)`` smallest entries to 0.  A modified copy is
    returned.
    """
    print(A.device)
    distances = cosineSimilartydis(A, B)  # distance of every anchor to every sample
    n = math.ceil(distances.shape[0]/19)
    # Work on a copy so the computed distance matrix itself stays untouched.
    sparsified = distances.clone()
    print(sparsified.device,'de')
    n_cols = sparsified.shape[1]
    for col in range(n_cols):
        # Indices of the n smallest distances in this column.
        nearest = np.argpartition(sparsified[:, col], n)[:n]
        sparsified[nearest, col] = 0

    return sparsified