Upload 13 files
Browse files- .gitattributes +2 -0
- 3Sources.mat +3 -0
- Cluster.py +284 -0
- Datasets.py +39 -0
- Prokaryotic.mat +3 -0
- SRW_KNN_greedy.py +219 -0
- anchors.py +22 -0
- config.py +85 -0
- data_loader.py +37 -0
- model.py +181 -0
- run.py +310 -0
- sample_kernal.py +44 -0
- train_methods.py +73 -0
- utils.py +80 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
3Sources.mat filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
Prokaryotic.mat filter=lfs diff=lfs merge=lfs -text
|
3Sources.mat
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0b56f4c3441fbaa0dc0281897851722c122bb00b24abdef15ff7e4c88dace833
|
| 3 |
+
size 112113
|
Cluster.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch.nn
|
| 4 |
+
from utils import cosineSimilartydis
|
| 5 |
+
from sklearn import metrics
|
| 6 |
+
import sklearn.metrics as metrics
|
| 7 |
+
from sklearn.cluster import KMeans
|
| 8 |
+
from munkres import Munkres
|
| 9 |
+
import sys
|
| 10 |
+
import logging
|
| 11 |
+
from sample_kernal import *
|
| 12 |
+
def _greedy_column_match(cost, n_rows):
    """Greedily pair rows ``0..n_rows-1`` of ``cost`` with distinct columns.

    Row ``i`` takes the column that currently has the smallest cost; that
    column is then set to ``inf`` for every row so later rows cannot reuse it.
    ``cost`` is modified in place.

    Returns:
        (columns, costs): the chosen column per row (list of int) and the cost
        of each chosen pair, read before the column is masked.
    """
    columns, costs = [], []
    for i in range(n_rows):
        # argsort()[0] (not argmin) keeps the original tie-breaking behaviour.
        j = int(torch.argsort(cost[i, :])[0])
        columns.append(j)
        costs.append(cost[i, j].item())
        cost[:, j] = float("inf")
    return columns, costs


def tiny_infer(model, device, all_data, all_label_X, all_label_Y):
    """Align two views with different sample counts and return aligned features.

    Steps, as implemented below:
      1. Encode both views with ``model`` (called as ``model(x0, x1)``).
      2. Greedily match every sample of the shorter view to a distinct sample
         of the longer view using ``cosineSimilartydis``.
      3. Sort the shorter-view embeddings by their match cost, locate the
         ``long_num - test_num`` largest gaps between consecutive costs and
         synthesise one embedding per gap with ``kernel_regression_multi_dim``
         / ``insert_and_sort`` (presumably provided by ``sample_kernal`` via
         the star import -- confirm).
      4. Greedily match every longer-view sample to a row of that padded set.

    Args:
        model: network returning one embedding matrix per view.
        device: device the inputs are moved to before the forward pass.
        all_data: pair of tensors; ``shape[1]`` of each is used as that view's
            sample count.
        all_label_X: labels of view 0 (NumPy array).
        all_label_Y: labels of view 1 (NumPy array).

    Returns:
        ``(alignre0, alignre1, class_labels, inference_acc)``: the longer-view
        embeddings, the padded-set rows matched to them, the labels of the
        shorter view, and ``matches / test_num``.
    """
    model.eval()
    len_alldata0 = all_data[0].shape[1]
    len_alldata1 = all_data[1].shape[1]
    view0_is_longer = len_alldata0 > len_alldata1
    align_labels = torch.zeros(max(len_alldata0, len_alldata1))
    if view0_is_longer:
        short_labels, long_labels = all_label_Y, all_label_X
        test_num, long_num = len_alldata1, len_alldata0
    else:
        short_labels, long_labels = all_label_X, all_label_Y
        test_num, long_num = len_alldata0, len_alldata1
    labels = torch.from_numpy(short_labels)

    with torch.no_grad():
        x0, x1, labels = all_data[0].to(device), all_data[1].to(device), labels.to(device)
        x0 = x0.view(x0.size()[0], -1).T
        x1 = x1.view(x1.size()[0], -1).T
        h0, h1 = model(x0, x1)

        # Rows of C are always the shorter view, columns the longer one.
        if view0_is_longer:
            short_h, long_h = h1, h0
            C = cosineSimilartydis(h0, h1).T
        else:
            short_h, long_h = h0, h1
            C = cosineSimilartydis(h0, h1)

        # Stage 1: one distinct longer-view partner per shorter-view sample.
        first_match, pair_costs = _greedy_column_match(C, test_num)
        align_out0 = short_h[:test_num].cpu()
        align_out1 = long_h[first_match].cpu().numpy()

        sort_value = torch.tensor(pair_costs, dtype=C.dtype)
        sorted_list, sorted_indice0 = torch.sort(sort_value)
        sorted_indice0 = sorted_indice0.to(torch.long)

        # Reorder the shorter-view embeddings by ascending match cost.
        sorted_align0 = align_out0[sorted_indice0].numpy()
        # Gaps between consecutive sorted costs.
        differences = sorted_list[1:] - sorted_list[:-1]
        x_known = np.arange(len(sorted_list))
        Xn = long_num - test_num
        # NOTE(review): topk raises when Xn > test_num - 1; unchanged from the original.
        _, top_indices = torch.topk(differences, Xn)
        # Midpoint (i + (i + 1)) / 2 of every selected gap.
        average_indices = top_indices.numpy() + 0.5
        bandwidth = 1.0

        A3_initial = kernel_regression_multi_dim(x_known, sorted_align0, average_indices, bandwidth)
        x_known_sorted, y_sorted_align0 = insert_and_sort(x_known, sorted_align0, average_indices, A3_initial)
        # Follow the embeddings' device instead of a hard-coded 'cuda'.
        y_sorted_align0 = torch.as_tensor(y_sorted_align0).to(long_h.device).float()

        # Stage 2: match every longer-view sample to a row of the padded set.
        Cre = cosineSimilartydis(long_h, y_sorted_align0)
        second_match, _ = _greedy_column_match(Cre, long_num)
        alignre0 = long_h[:long_num].cpu().numpy()
        # Fix: index with this stage's own match, not the stale `idx` left over
        # from stage 1 (which repeated one row for every sample).
        alignre1 = y_sorted_align0[second_match].cpu().numpy()

        # NOTE(review): rows of y_sorted_align0 were re-sorted and interleaved
        # with synthetic rows, so row j is not guaranteed to be short-view
        # sample j. The comparison is kept as a rough diagnostic and guarded so
        # it cannot index past the short label array -- confirm intended mapping.
        n_short = len(short_labels)
        for i, j in enumerate(second_match):
            if j < n_short and long_labels[i] == short_labels[j]:
                align_labels[i] = 1

    class_labels_cluster = labels.cpu().numpy()
    count = torch.sum(align_labels)
    inference_acc = count.item() / test_num
    print(inference_acc)
    print(np.shape(align_out1))
    return np.array(alignre0), np.array(alignre1), np.array(class_labels_cluster), inference_acc
|
| 116 |
+
# return np.array(align_out0), np.array(align_out1), np.array(class_labels_cluster), inference_acc
|
| 117 |
+
def Clustering(x_list, y):
    """Run k-means on the concatenated views and score it against ``y``.

    Args:
        x_list: sequence of 2-D arrays with one row per sample; they are
            concatenated along ``axis=1`` before clustering.
        y: ground-truth labels; the number of clusters is the number of
            distinct values in ``y``.

    Returns:
        ``(y_preds, ret, accuracy, nmi, ari, f_score, f_score2, precision,
        precision2, recall, purity)`` where ``ret`` is ``{'kmeans': scores}``
        and the remaining values come straight from ``clustering_metric``.
    """
    n_clusters = np.size(np.unique(y))

    x_final_concat = np.concatenate(x_list, axis=1)
    kmeans_assignments, _ = get_cluster_sols(
        x_final_concat,
        ClusterClass=KMeans,
        n_clusters=n_clusters,
        init_args={'n_init': 10},
    )
    y_preds = get_y_preds(y, kmeans_assignments, n_clusters)

    # Labels that start at 1 are shifted to start at 0 before scoring.
    if np.min(y) == 1:
        y = y - 1
    (scores, _, accuracy, nmi, ari, f_score, f_score2,
     precision, precision2, recall, purity) = clustering_metric(y, kmeans_assignments, n_clusters)

    ret = {'kmeans': scores}
    return y_preds, ret, accuracy, nmi, ari, f_score, f_score2, precision, precision2, recall, purity
|
| 134 |
+
|
| 135 |
+
def get_y_preds(y_true, cluster_assignments, n_clusters):
    """Relabel cluster assignments so they line up with the true labels.

    The cluster-to-label mapping is the optimal one-to-one assignment found by
    Munkres on the confusion matrix.

    Both inputs are first encoded as codes ``0..n_clusters-1`` and the
    confusion matrix is built over exactly that range. Without this, true
    labels starting at 1 (or assignments not starting at 0) make
    ``confusion_matrix`` use the union of both label sets, which shifts rows
    against columns and yields a wrong mapping.

    Args:
        y_true: true labels.
        cluster_assignments: cluster labels, e.g. as output by k-means.
        n_clusters: number of clusters in the dataset.

    Returns:
        Array of predicted labels, one per sample. They are expressed in the
        values of ``y_true`` when ``y_true`` has exactly ``n_clusters``
        distinct values, otherwise as codes ``0..n_clusters-1``.
    """
    y_true = np.asarray(y_true).reshape(-1)
    true_values, true_codes = np.unique(y_true, return_inverse=True)
    true_codes = np.reshape(true_codes, -1)

    # Integer codes are required for indexing; a failed clustering may hand
    # over a float array of zeros.
    cluster_codes = np.asarray(cluster_assignments).reshape(-1).astype(np.int64)
    if cluster_codes.size and cluster_codes.min() != 0:
        cluster_codes = cluster_codes - cluster_codes.min()

    confusion_matrix = metrics.confusion_matrix(
        true_codes, cluster_codes, labels=np.arange(n_clusters)
    )
    # Optimal 1:1 assignment of clusters to labels.
    cost_matrix = calculate_cost_matrix(confusion_matrix, n_clusters)
    indices = Munkres().compute(cost_matrix)
    kmeans_to_true_cluster_labels = get_cluster_labels_from_indices(indices).astype(np.int64)

    pred_codes = kmeans_to_true_cluster_labels[cluster_codes]
    if true_values.size == n_clusters:
        return true_values[pred_codes]
    return pred_codes
|
| 157 |
+
|
| 158 |
+
def get_cluster_sols(x, cluster_obj=None, ClusterClass=None, n_clusters=None, init_args=None):
    """Generate cluster assignments for ``x``.

    Either a ready ``cluster_obj`` is used for prediction as-is, or a new
    ``ClusterClass(n_clusters, **init_args)`` is created and fitted on ``x``
    (up to 10 attempts).

    Args:
        x: the points to cluster.
        cluster_obj: an already fitted clustering object (needs ``predict``).
        ClusterClass: clustering class to instantiate when ``cluster_obj`` is
            ``None`` (needs ``fit`` and ``predict``).
        n_clusters: number of clusters, required with ``ClusterClass``.
        init_args: extra keyword arguments for ``ClusterClass``; ``None`` means
            no extra arguments.

    Returns:
        ``(cluster_assignments, cluster_obj)``. If every fit attempt fails the
        assignments are an all-zero float array of length ``len(x)``.

    Raises:
        ValueError: if neither ``cluster_obj`` nor both ``ClusterClass`` and
            ``n_clusters`` are given.
    """
    # An explicit raise survives ``python -O``; the former assert did not.
    if cluster_obj is None and (ClusterClass is None or n_clusters is None):
        raise ValueError("provide cluster_obj, or both ClusterClass and n_clusters")

    if cluster_obj is None:
        # A fresh dict per call; a shared ``{}`` default could leak state.
        cluster_obj = ClusterClass(n_clusters, **({} if init_args is None else init_args))
        for _ in range(10):
            try:
                cluster_obj.fit(x)
                break
            except Exception:
                # Best effort: report and retry; KeyboardInterrupt/SystemExit
                # are no longer swallowed.
                print("Unexpected error:", sys.exc_info())
        else:
            return np.zeros((len(x),)), cluster_obj

    cluster_assignments = cluster_obj.predict(x)
    return cluster_assignments, cluster_obj
|
| 189 |
+
|
| 190 |
+
def calculate_cost_matrix(C, n_clusters):
    """Turn a confusion matrix into an assignment cost matrix.

    ``cost[j, i]`` is the cost of assigning cluster ``j`` (column ``j`` of
    ``C``) to label ``i`` (row ``i`` of ``C``): the number of samples in
    cluster ``j`` that do *not* carry label ``i``.

    Args:
        C: confusion matrix with true labels on rows and clusters on columns.
        n_clusters: number of clusters/labels to consider.

    Returns:
        Float array of shape ``(n_clusters, n_clusters)``.

    Raises:
        IndexError: if ``C`` is smaller than ``n_clusters`` in either dimension.
    """
    C = np.asarray(C)
    if C.shape[0] < n_clusters or C.shape[1] < n_clusters:
        raise IndexError("confusion matrix is smaller than n_clusters")
    # Column totals use every row of C: size of each cluster.
    cluster_sizes = C[:, :n_clusters].sum(axis=0)
    cost_matrix = cluster_sizes[:, None] - C[:n_clusters, :n_clusters].T
    return cost_matrix.astype(float)
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def get_cluster_labels_from_indices(indices):
    """Extract the assigned label of every cluster from assignment pairs.

    Args:
        indices: sequence of pairs; the second element of the ``i``-th pair is
            taken as the label for cluster ``i`` (the pairs are read in the
            order given).

    Returns:
        Float array with one label per pair.
    """
    return np.array([pair[1] for pair in indices], dtype=float)
|
| 208 |
+
|
| 209 |
+
def clustering_metric(y_true, y_pred, n_clusters, verbose=False, decimals=4):
    """Score a clustering against ground-truth labels.

    The cluster labels are first mapped onto the true labels with
    ``get_y_preds``; every score is computed on that mapped prediction and
    rounded to ``decimals`` places.

    Args:
        y_true: true labels.
        y_pred: raw cluster assignments.
        n_clusters: number of clusters.
        verbose: accepted for compatibility; not used by this function.
        decimals: number of decimals kept for every score.

    Returns:
        ``(scores, confusion_matrix, accuracy, nmi, ari, f_score, f_score2,
        precision, precision2, recall, purity)``. ``scores`` is a dict with
        keys ``ACC, AMI, NMI, ARI, F1, F2, PRE, PRE2, REC, PUR``; ``F1``/``PRE``
        are macro-averaged and ``F2``/``PRE2`` weighted-averaged.
    """
    y_pred_ajusted = get_y_preds(y_true, y_pred, n_clusters)
    _, confusion_matrix = classification_metric(y_true, y_pred_ajusted)

    def _rounded(value):
        return np.round(value, decimals)

    accuracy = _rounded(metrics.accuracy_score(y_true, y_pred_ajusted))
    ami = _rounded(metrics.adjusted_mutual_info_score(y_true, y_pred_ajusted))
    nmi = _rounded(metrics.normalized_mutual_info_score(y_true, y_pred_ajusted))
    ari = _rounded(metrics.adjusted_rand_score(y_true, y_pred_ajusted))
    f_score = _rounded(metrics.f1_score(y_true, y_pred_ajusted, average='macro'))
    f_score2 = _rounded(metrics.f1_score(y_true, y_pred_ajusted, average='weighted'))
    precision = _rounded(metrics.precision_score(y_true, y_pred_ajusted, average='macro'))
    precision2 = _rounded(metrics.precision_score(y_true, y_pred_ajusted, average='weighted'))
    recall = _rounded(metrics.recall_score(y_true, y_pred_ajusted, average='macro'))
    # Purity gets its own copy so the caller's labels can never be relabelled
    # in place by it.
    purity = _rounded(Purity(np.array(y_true), y_pred_ajusted))

    scores = {
        'ACC': accuracy, 'AMI': ami, 'NMI': nmi, 'ARI': ari,
        'F1': f_score, 'F2': f_score2,
        'PRE': precision, 'PRE2': precision2,
        'REC': recall, 'PUR': purity,
    }
    return (scores, confusion_matrix, accuracy, nmi, ari, f_score, f_score2,
            precision, precision2, recall, purity)
|
| 245 |
+
def Purity(y_true, y_pred):
    """Return the purity of a clustering.

    Each predicted cluster votes for its most frequent true label; purity is
    the fraction of samples that carry their cluster's winning label.

    The inputs are never modified. (The previous version relabelled ``y_true``
    in place, which changed the caller's array and could merge two classes
    when a label value equalled an earlier target code, e.g. labels -1 and 0.)

    Args:
        y_true: true labels.
        y_pred: cluster labels, same length as ``y_true``.

    Returns:
        Purity as a float in ``[0, 1]``; ``0.0`` for empty input.
    """
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    if y_true.size == 0:
        return 0.0

    # Dense codes 0..k-1 for the true labels, computed without touching y_true.
    _, true_codes = np.unique(y_true, return_inverse=True)
    true_codes = np.reshape(true_codes, -1)

    majority_total = 0
    for cluster in np.unique(y_pred):
        members = true_codes[y_pred == cluster]
        majority_total += int(np.bincount(members).max())
    return majority_total / y_true.size
|
| 260 |
+
|
| 261 |
+
def classification_metric(y_true, y_pred, average='macro', verbose=False, decimals=4):
    """Compute accuracy, precision, recall and F-score plus the confusion matrix.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
        average: averaging mode forwarded to the precision/recall/F1 scorers.
        verbose: when true, the four scores are written with ``logging.info``.
        decimals: number of decimals kept for every score.

    Returns:
        ``(scores, confusion_matrix)`` where ``scores`` has the keys
        ``accuracy``, ``precision``, ``recall`` and ``f_measure``.
    """
    confusion_matrix = metrics.confusion_matrix(y_true, y_pred)

    accuracy = np.round(metrics.accuracy_score(y_true, y_pred), decimals)
    precision = np.round(metrics.precision_score(y_true, y_pred, average=average), decimals)
    recall = np.round(metrics.recall_score(y_true, y_pred, average=average), decimals)
    f_score = np.round(metrics.f1_score(y_true, y_pred, average=average), decimals)

    if verbose:
        logging.info('accuracy: {}, precision: {}, recall: {}, f_measure: {}'.format(accuracy, precision, recall, f_score))
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f_measure': f_score}, confusion_matrix
|
Datasets.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torch.utils.data import Dataset, DataLoader
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
class GetDataset(Dataset):
    """Two-view dataset that serves one column of each view per item.

    Args:
        data: pair of 2-D arrays; sample ``i`` is column ``i`` of each array.
        labels: one label per sample; its length is the dataset length.
        real_labels: optional second label per sample; pass an empty sequence
            to omit it from the returned items.
    """

    def __init__(self, data, labels, real_labels):
        self.data = data
        self.labels = labels
        self.real_labels = real_labels

    def _view_feature(self, view, index):
        # Column `index` of the view as a float32 tensor of shape (1, n_features).
        column = self.data[view][:, index]
        return torch.as_tensor(column).float().unsqueeze(0)

    def __getitem__(self, index):
        """Return ``(fea0, fea1, label)``, plus ``real_label`` when available."""
        fea0 = self._view_feature(0, index)
        fea1 = self._view_feature(1, index)
        label = np.int64(self.labels[index])
        # len() rather than truthiness: real_labels may be a NumPy array.
        if len(self.real_labels) == 0:
            return fea0, fea1, label
        real_label = np.int64(self.real_labels[index])
        return fea0, fea1, label, real_label

    def __len__(self):
        return len(self.labels)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class GetAllDataset(Dataset):
    """Two-view dataset that also carries a class label for each view.

    Args:
        data: pair of 2-D arrays; sample ``i`` is column ``i`` of each array.
        labels: one label per sample; its length is the dataset length.
        class_labels0: class label per sample for view 0.
        class_labels1: class label per sample for view 1.
    """

    def __init__(self, data, labels, class_labels0, class_labels1):
        self.data = data
        self.labels = labels
        self.class_labels0 = class_labels0
        self.class_labels1 = class_labels1

    def _view_feature(self, view, index):
        # Column `index` of the view as a float32 tensor of shape (1, n_features).
        column = self.data[view][:, index]
        return torch.as_tensor(column).float().unsqueeze(0)

    def __getitem__(self, index):
        """Return ``(fea0, fea1, label, class_labels0, class_labels1)``."""
        fea0 = self._view_feature(0, index)
        fea1 = self._view_feature(1, index)
        label = np.int64(self.labels[index])
        class_labels0 = np.int64(self.class_labels0[index])
        class_labels1 = np.int64(self.class_labels1[index])
        return fea0, fea1, label, class_labels0, class_labels1

    def __len__(self):
        return len(self.labels)
|
Prokaryotic.mat
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:20471598a8c819fb35e94f102cda4300a3b4f2cd185d3cbe0ea06e0349a7ed7c
|
| 3 |
+
size 3105301
|
SRW_KNN_greedy.py
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
import umap
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
from run import *
|
| 6 |
+
# 固定随机数种子,生成100个二维数据点
|
| 7 |
+
# torch.manual_seed(99)
|
| 8 |
+
def para(data, num_nodes, num_class):
    """Derive random-walk and anchor-selection hyper-parameters.

    Args:
        data: feature matrix with one row per node.
        num_nodes: number of nodes; selects the walk schedule.
        num_class: number of classes; twice this many anchors are requested.

    Returns:
        ``(num_walks, walk_length, similarity_threshold, num_anchors,
        coverage_radius)``. ``coverage_radius`` is 0.3 times the mean
        off-diagonal value of ``cosineSimilartydis(data, data)``.
    """
    similarity_threshold = 0.4  # similarity threshold
    num_anchors = num_class * 2  # number of anchors
    distances = cosineSimilartydis(data, data)
    # Mean over all pairs except the diagonal (a point's distance to itself).
    # The mask is created on the same device as the distances.
    off_diagonal = ~torch.eye(distances.size(0), dtype=torch.bool, device=distances.device)
    mean_distance = distances[off_diagonal].mean()
    coverage_radius = mean_distance * 0.3  # cover radius for the greedy cover
    # TODO (from the original author): report an error when there are fewer
    # aligned samples than requested anchors.

    # (exclusive upper bound on num_nodes, num_walks, walk_length):
    # small, medium and large graphs; anything bigger uses the fallback.
    schedule = ((100, 20, 3), (1000, 10, 5), (10000, 5, 10))
    num_walks, walk_length = 3, 20
    for upper_bound, walks, length in schedule:
        if num_nodes < upper_bound:
            num_walks, walk_length = walks, length
            break
    return num_walks, walk_length, similarity_threshold, num_anchors, coverage_radius
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def cosineSimilarty(A, B):
    """Column-wise min-max normalised cosine similarity between rows of A and B.

    Rows are L2-normalised (with a small epsilon added to the norm), the
    pairwise dot products are computed, and every column of the result is
    rescaled to ``[0, 1]`` by its own minimum and maximum. A column whose
    entries are all equal would divide 0 by 0; those entries become 0.0001.

    Args:
        A: 2-D tensor, one sample per row.
        B: 2-D tensor with the same number of columns as ``A``.

    Returns:
        Tensor of shape ``(A.size(0), B.size(0))``.
    """
    norm_eps = 0.000001   # keeps the division finite for all-zero rows
    nan_fill = 0.0001     # stands in for 0/0 in constant columns

    A = A / (torch.norm(A, dim=1, p=2, keepdim=True) + norm_eps)
    B = B / (torch.norm(B, dim=1, p=2, keepdim=True) + norm_eps)

    W = torch.mm(A, B.t())
    col_max = W.max(dim=0).values
    col_min = W.min(dim=0).values
    normalized_matrix = (W - col_min) / (col_max - col_min)
    return torch.nan_to_num(normalized_matrix, nan=nan_fill)
|
| 43 |
+
|
| 44 |
+
def cosineSimilartydis(A, B):
    """Cosine *distance* between rows of A and B: one minus the normalised similarity.

    Rows are L2-normalised (with a small epsilon added to the norm), the
    pairwise dot products are rescaled per column to ``[0, 1]`` by that
    column's minimum and maximum, and the result is subtracted from 1. Entries
    of a constant column (0/0) are treated as similarity 0.0001.

    Args:
        A: 2-D tensor, one sample per row.
        B: 2-D tensor with the same number of columns as ``A``.

    Returns:
        Tensor of shape ``(A.size(0), B.size(0))``.
    """
    norm_eps = 0.000001   # keeps the division finite for all-zero rows
    nan_fill = 0.0001     # stands in for 0/0 in constant columns

    A = A / (torch.norm(A, dim=1, p=2, keepdim=True) + norm_eps)
    B = B / (torch.norm(B, dim=1, p=2, keepdim=True) + norm_eps)

    W = torch.mm(A, B.t())
    col_max = W.max(dim=0).values
    col_min = W.min(dim=0).values
    normalized_matrix = (W - col_min) / (col_max - col_min)
    normalized_matrix = torch.nan_to_num(normalized_matrix, nan=nan_fill)
    return 1 - normalized_matrix
|
| 54 |
+
# # 随机游走参数
|
| 55 |
+
#
|
| 56 |
+
#
|
| 57 |
+
# Step 1: 初始化完全图的转移概率矩阵
|
| 58 |
+
# distances = torch.cdist(data, data, p=2) # 计算所有点之间的欧几里得距离
|
| 59 |
+
# adj_matrix = torch.exp(-distances) # 高斯权重:距离越小权重越高
|
| 60 |
+
# def visit(data):
|
| 61 |
+
# adj_matrix=cosineSimilarty(data,data)
|
| 62 |
+
# print(np.shape(adj_matrix))
|
| 63 |
+
# adj_matrix.fill_diagonal_(0) # 去掉自身连接
|
| 64 |
+
# transition_matrix = adj_matrix / adj_matrix.sum(dim=1, keepdim=True) # 归一化为转移概率
|
| 65 |
+
# return transition_matrix
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def visit(data, alpha=0.5):
    """Build a row-normalised transition matrix from node features.

    The normalised cosine similarity from ``cosineSimilarty`` is
    row-normalised, clamped to at least 0.0001, raised to the power
    ``-alpha`` and row-normalised again. With a positive ``alpha`` this gives
    *less* similar nodes a larger transition weight.

    Args:
        data: feature matrix, one node per row.
        alpha: exponent of the re-weighting ``p ** (-alpha)``.

    Returns:
        Square tensor with one row of transition weights per node.
    """
    num_nodes = data.size(0)

    # Pairwise similarity between nodes.
    adj_matrix = cosineSimilarty(data, data)

    adj_matrix.fill_diagonal_(0)  # drop self connections
    adj_matrix = torch.nan_to_num(adj_matrix, nan=0.0001)  # guard against NaN

    # Row-normalise; the epsilon prevents a division by zero.
    row_sums = adj_matrix.sum(dim=1, keepdim=True) + 0.000001
    adj_matrix = adj_matrix / row_sums

    adj_matrix = torch.nan_to_num(adj_matrix, nan=0.0001)
    # NOTE(review): the clamp lifts the zeroed diagonal to 0.0001, and the
    # negative power below then turns it into the largest weight of the row,
    # so self transitions come back -- confirm this is intended.
    adj_matrix = torch.clamp(adj_matrix, min=0.0001)

    # Re-weight: r(x) = x ** (-alpha).
    transition_matrix = adj_matrix ** (-alpha)

    # Row-normalise again so that every row sums to (almost) 1.
    transition_matrix = transition_matrix / (transition_matrix.sum(dim=1, keepdim=True) + 0.000001)

    # Rows that still sum to 0 fall back to a uniform distribution.
    zero_rows = (transition_matrix.sum(dim=1) == 0)
    if zero_rows.any():
        transition_matrix[zero_rows] = 1.0 / num_nodes

    return transition_matrix
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# 优化方法
|
| 106 |
+
def random_walk_batch_paths(transition_matrix, num_walks, walk_length):
    """Run batched random walks from every node and count node visits.

    For each start node, ``num_walks`` walks of ``walk_length`` steps are
    sampled with ``torch.multinomial`` from the rows of ``transition_matrix``.
    Row ``s`` of the result counts how often each node appears on the walks
    started at ``s``; the ``num_walks`` occurrences of ``s`` as the starting
    position are subtracted from the diagonal.

    All tensors are created on the device and with the dtype of
    ``transition_matrix`` (previously hard-coded to CUDA / float32).

    Args:
        transition_matrix: square tensor of non-negative transition weights.
        num_walks: number of walks per start node.
        walk_length: number of steps per walk.

    Returns:
        Tensor shaped like ``transition_matrix`` holding the visit counts.
    """
    num_nodes = transition_matrix.size(0)
    device = transition_matrix.device
    visit_matrix = torch.zeros_like(transition_matrix)

    for start_node in range(num_nodes):
        # One walk per row; column 0 is the start node.
        paths = torch.full((num_walks, walk_length + 1), start_node,
                           dtype=torch.long, device=device)
        for step in range(walk_length):
            probs = transition_matrix[paths[:, step]]
            # squeeze(1) keeps a 1-D result even when num_walks == 1.
            paths[:, step + 1] = torch.multinomial(probs, 1).squeeze(1)

        # Count all visits of this start node's walks in one pass.
        counts = torch.bincount(paths.reshape(-1), minlength=visit_matrix.size(1))
        visit_matrix[start_node] += counts.to(visit_matrix.dtype)

    # Remove the start position of every walk from the diagonal.
    visit_matrix -= num_walks * torch.eye(num_nodes, dtype=visit_matrix.dtype, device=device)
    return visit_matrix
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# visit_matrix = random_walk_batch_paths(transition_matrix, num_walks, walk_length)
|
| 129 |
+
#
|
| 130 |
+
# # visit_matrix = random_walk_parallel(transition_matrix, num_walks, walk_length)
|
| 131 |
+
#
|
| 132 |
+
# Step 3: 归一化访问频率为相似度,构建基于阈值的 kNN 图
|
| 133 |
+
def thresholded_knn(visit_matrix, similarity_threshold):
    """Build a 0/1 adjacency matrix from visit counts.

    Counts are divided by the global maximum; an edge is kept where the
    resulting similarity is strictly greater than ``similarity_threshold``.

    Args:
        visit_matrix: tensor of visit counts.
        similarity_threshold: cut-off applied to the normalised counts.

    Returns:
        Float tensor of zeros and ones shaped like ``visit_matrix``. An empty
        matrix or one whose maximum is 0 yields all zeros.
    """
    # Handle the degenerate cases explicitly; max() fails on an empty tensor
    # and a zero maximum would otherwise rely on NaN/inf comparisons.
    if visit_matrix.numel() == 0:
        return torch.zeros_like(visit_matrix, dtype=torch.float)
    peak = visit_matrix.max()
    if peak == 0:
        return torch.zeros_like(visit_matrix, dtype=torch.float)

    similarity_matrix = visit_matrix / peak
    return (similarity_matrix > similarity_threshold).float()
|
| 137 |
+
# # Step 5: 贪心覆盖算法选择锚点
|
| 138 |
+
def greedy_cover_with_importance(data, importance_scores, r, num_anchors):
    """Select anchors with a greedy cover ordered by importance.

    Points are scanned from most to least important. A point that is neither
    covered nor already an anchor becomes an anchor and covers every point
    within distance ``r`` of it (``cosineSimilartydis``). When a sweep leaves
    every point covered, the cover flags are reset (anchors stay covered) and
    scanning restarts until enough anchors are found.

    The number of anchors is capped at the number of points; asking for more
    used to loop forever. Tensors are created on ``data``'s device rather than
    a hard-coded CUDA device.

    Args:
        data: points, shape ``(n_samples, n_features)``.
        importance_scores: one score per point (e.g. random-walk visit counts).
        r: cover radius.
        num_anchors: number of anchors requested.

    Returns:
        1-D long tensor with the selected anchor indices, in selection order.
    """
    num_points = data.size(0)
    device = data.device
    target = min(num_anchors, num_points)

    distances = cosineSimilartydis(data, data)  # pairwise distances
    selected = []  # anchor indices in selection order
    covered = torch.zeros(num_points, dtype=torch.bool, device=device)
    is_anchor = torch.zeros(num_points, dtype=torch.bool, device=device)
    # Most important points first.
    sorted_indices = torch.argsort(importance_scores, descending=True).tolist()

    while len(selected) < target:
        for idx in sorted_indices:
            if len(selected) >= target:
                break
            if not covered[idx] and not is_anchor[idx]:
                selected.append(idx)
                is_anchor[idx] = True
                # Everything within the radius of the new anchor is covered.
                covered |= distances[idx] <= r
                # The anchor itself must count as covered as well.
                covered[idx] = True

        # NOTE(review): the reset is applied once per sweep; the page this was
        # recovered from lost its indentation -- confirm against the repository.
        if covered.sum().item() == num_points:
            print("所有点已被覆盖,重置覆盖状态")
            # Reset the cover flags but keep the chosen anchors covered.
            covered[:] = False
            covered[selected] = True
            print(len(selected))

    return torch.tensor(selected, dtype=torch.long, device=device)
|
| 179 |
+
|
| 180 |
+
# 计算节点的重要性(访问频率的总和)
|
| 181 |
+
# node_importance = visit_matrix.sum(dim=1)
|
| 182 |
+
#
|
| 183 |
+
# # 使用贪心覆盖算法选择锚点
|
| 184 |
+
# anchor_indices = greedy_cover_with_importance(data, node_importance, coverage_radius, num_anchors)
|
| 185 |
+
# anchors = data[anchor_indices] # 提取锚点
|
| 186 |
+
|
| 187 |
+
# # Step 6: 可视化结果
|
| 188 |
+
# # from sklearn.decomposition import PCA
|
| 189 |
+
# # import matplotlib.pyplot as plt
|
| 190 |
+
# #
|
| 191 |
+
# # # 假设 data 和 anchors 是 5维张量
|
| 192 |
+
# # pca = PCA(n_components=2, random_state=42)
|
| 193 |
+
# #
|
| 194 |
+
# # # 降维到 2D
|
| 195 |
+
# # data_2d = pca.fit_transform(data.detach().cpu().numpy())
|
| 196 |
+
# # anchors_2d = pca.transform(anchors.detach().cpu().numpy())
|
| 197 |
+
# #
|
| 198 |
+
# # # 绘制统一显示的散点图
|
| 199 |
+
# # plt.figure(figsize=(8, 8))
|
| 200 |
+
# # plt.scatter(data_2d[:, 0], data_2d[:, 1], c='blue', label="Data Points", alpha=0.5, s=30)
|
| 201 |
+
# # plt.scatter(anchors_2d[:, 0], anchors_2d[:, 1], color="red", label="Anchor Points", s=100, edgecolor='black')
|
| 202 |
+
# # plt.title("Unified Visualization with PCA")
|
| 203 |
+
# # plt.legend()
|
| 204 |
+
# # plt.show()
|
| 205 |
+
#
|
| 206 |
+
#
|
| 207 |
+
#
|
| 208 |
+
# # 使用 UMAP 降维
|
| 209 |
+
# reducer = umap.UMAP(n_components=2)
|
| 210 |
+
# data_2d = reducer.fit_transform(data.detach().cpu().numpy())
|
| 211 |
+
# anchors_2d = reducer.transform(anchors.detach().cpu().numpy())
|
| 212 |
+
#
|
| 213 |
+
# # 绘制统一显示图
|
| 214 |
+
# plt.figure(figsize=(8, 8))
|
| 215 |
+
# plt.scatter(data_2d[:, 0], data_2d[:, 1], c='blue', label="Data Points", alpha=0.5, s=30)
|
| 216 |
+
# plt.scatter(anchors_2d[:, 0], anchors_2d[:, 1], color="red", label="Anchor Points", s=20, edgecolor='black')
|
| 217 |
+
# plt.title("Unified Visualization with UMAP")
|
| 218 |
+
# plt.legend()
|
| 219 |
+
# plt.show()
|
anchors.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from SRW_KNN_greedy import *
|
| 2 |
+
import torch
|
| 3 |
+
def get_anchors(h0, h1, map_pairs, num_unique_labels):
    """Pick anchor samples from both views and return their original features.

    For each embedding matrix a transition matrix is built (``visit``), random
    walks are sampled (``random_walk_batch_paths``), the column sums of the
    visit matrix serve as node importance, and anchors are chosen with
    ``greedy_cover_with_importance``. The anchor indices of both views are
    merged and de-duplicated, then used to index both entries of ``map_pairs``.

    Args:
        h0: embeddings of view 0, one row per sample.
        h1: embeddings of view 1, one row per sample.
        map_pairs: pair of array-likes holding the pre-embedding features of
            the two views; both are indexed with the same anchor indices.
        num_unique_labels: number of classes, forwarded to ``para``.

    Returns:
        ``(anchors0, anchors1, len_indices)``: float tensors with the anchor
        rows of each view and the number of distinct anchors.
    """
    # Random-walk / cover parameters for each view.
    num_walks0, walk_length0, _, num_anchors0, coverage_radius0 = para(
        h0, h0.shape[0], num_unique_labels)
    num_walks1, walk_length1, _, num_anchors1, coverage_radius1 = para(
        h1, h1.shape[0], num_unique_labels)

    # Transition probabilities, then visit counts.
    transition_matrix0, transition_matrix1 = visit(h0), visit(h1)
    visit_matrix0 = random_walk_batch_paths(transition_matrix0, num_walks0, walk_length0)
    visit_matrix1 = random_walk_batch_paths(transition_matrix1, num_walks1, walk_length1)

    # Importance of a node = how often it was visited from any start node.
    node_importance0, node_importance1 = visit_matrix0.sum(dim=0), visit_matrix1.sum(dim=0)

    # Greedy cover picks the anchors of each view.
    anchor_indices0 = greedy_cover_with_importance(h0, node_importance0, coverage_radius0, num_anchors0)
    anchor_indices1 = greedy_cover_with_importance(h1, node_importance1, coverage_radius1, num_anchors1)

    # Merge both index sets and drop duplicates.
    unique_indices = torch.unique(torch.cat((anchor_indices0, anchor_indices1)))
    len_indices = len(unique_indices)

    # as_tensor avoids re-wrapping inputs that are already tensors; the index
    # is moved to each tensor's device so CPU features can be indexed too.
    mapdata0, mapdata1 = torch.as_tensor(map_pairs[0]), torch.as_tensor(map_pairs[1])
    anchors0 = mapdata0[unique_indices.to(mapdata0.device)].float()  # anchors before embedding
    anchors1 = mapdata1[unique_indices.to(mapdata1.device)].float()
    return anchors0, anchors1, len_indices
|
config.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from easydict import EasyDict
|
| 2 |
+
# Global experiment configuration, exposed as the module-level ``config`` object.
# Exactly one dataset section below is meant to be active at a time; the others
# are kept commented out so they can be swapped in by hand.
config = EasyDict()
# NOTE(review): run.py itself does ``from config import *``, so this import is
# circular, and ``dim`` is not referenced anywhere else in this file. Confirm
# whether it is still needed before touching the statement order here.
from run import dim
# --- active dataset section ---
# input_features1/2: input width of view 1 / view 2.
# enhidden_features: widths of the encoder layers (shared by both views).
# dehidden_features1/2: widths of each view's decoder layers; in the active
#   section the last entry equals that view's input width.
'''3Sources'''
config.input_features1 =3560
config.input_features2 =3631
config.enhidden_features = [2000, 320, 50,6]
config.dehidden_features1 = [50, 320, 2000,3560]
config.dehidden_features2 = [50, 320, 2000,3631]
config.classes = 6
'''BBCsports'''
# config.input_features1 =2582
# config.input_features2 =2544
# config.enhidden_features = [1500, 200, 50,5]
# config.dehidden_features1 = [50, 200, 1500,2582]
# config.dehidden_features2 = [50, 200, 1500,2544]
# config.classes = 5
'''Caltech101'''
# config.input_features1 =1984
# config.input_features2 =512
# config.enhidden_features = [500, 320, 50,10]
# config.dehidden_features1 = [50, 320, 500,1984]
# config.dehidden_features2 = [50, 320, 500,512]
# config.classes = 20
'''ORL_mtv'''
# config.input_features1 =400
# config.input_features2 =400
# config.enhidden_features = [300, 150, 50,10]
# config.dehidden_features1 = [50, 150, 300,400]
# config.dehidden_features2 = [50, 150, 300,400]
# config.classes = 40
'''Caltech101_7'''
# config.input_features1 =1984
# config.input_features2 =512
# config.enhidden_features = [500, 320, 50,5]
# config.dehidden_features1 = [50, 320, 500,1984]
# config.dehidden_features2 = [50, 320, 500,512]
# config.classes = 7
'''scene15'''
# config.input_features1 =20
# config.input_features2 =59
# config.enhidden_features = [20, 15, 15,10]
# config.dehidden_features1 = [15, 15, 20,20]
# config.dehidden_features2 = [15, 15, 20,59]
# config.classes = 10
'''Prokaryotic'''
# config.input_features1 =393
# config.input_features2 =438
# config.enhidden_features = [300, 150, 50,10]
# config.dehidden_features1 = [50, 150, 300,393]
# config.dehidden_features2 = [50, 150, 300,438]
# config.classes = 4
'''yale_mtv'''
# config.input_features1 =4096
# config.input_features2 =3304
# config.enhidden_features = [1500, 200, 50,5]
# config.dehidden_features1 = [50, 200, 1500,4096]
# config.dehidden_features2 = [50, 200, 1500,3304]
# config.classes = 15
'''flower17'''
# config.input_features1 =1360
# config.input_features2 =1360
# config.enhidden_features = [1000, 200, 50,5]
# config.dehidden_features1 = [50, 200, 1000,1360]
# config.dehidden_features2 = [50, 200, 1000,1360]
# config.classes = 17
'''100leaves'''
# config.input_features1 =64
# config.input_features2 =64
# config.enhidden_features = [200, 200, 50,10]
# config.dehidden_features1 = [50, 200, 200,64]
# config.dehidden_features2 = [50, 200, 200,64]
# config.classes = 100

# --- optimisation settings ---
config.lr = 1e-3
config.momentum = 0.9  # SGD-only parameter: momentum steers the current gradient with a weighted average of past gradients, which damps oscillation
config.weight_decay = 0
config.w_v = 0

# --- logging / schedule settings ---
config.print_step = 10
config.tensorboard_step = 100
config.load_iter = 0
config.train_iters = 5000
config.is_train = True
config.use_cuda = True
|
data_loader.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import mat73
|
| 2 |
+
import numpy as np
|
| 3 |
+
import scipy.io as sio
|
| 4 |
+
import torch
|
| 5 |
+
import random
|
| 6 |
+
from torch.utils.data import Dataset, DataLoader
|
| 7 |
+
from utils import *
|
| 8 |
+
def get_pairs(E_X, E_Y, neg_prop, train_label):
    """Build positive and randomly sampled negative cross-view pairs.

    Sample ``i`` of ``E_X`` paired with sample ``i`` of ``E_Y`` is a positive
    pair (label 1). For every sample of ``E_X``, ``neg_prop`` distinct indices
    of ``E_Y`` are then drawn with ``random.sample`` and used as negative
    pairs (label 0). Because the draw ignores classes, a "negative" pair can
    share a class; ``real_labels`` records 1 for such pairs, whereas
    ``labels`` always records 0 for them.

    :param E_X: view-0 samples, indexable by position
    :param E_Y: view-1 samples, indexable by position
    :param neg_prop: number of negatives drawn per view-0 sample
    :param train_label: class label of each aligned sample
    :return: ``(view0, view1, labels, real_labels, class_labels0,
        class_labels1)`` as numpy arrays (float32 for the views, int64 for
        the label arrays); positives come first, then the negatives
    """
    n_anchor = len(E_X)
    positions = range(n_anchor)

    # Positive pairs: the i-th sample of each view.
    view0 = [E_X[pos] for pos in positions]
    view1 = [E_Y[pos] for pos in positions]
    labels = [1] * n_anchor
    real_labels = [1] * n_anchor
    class_labels0 = [train_label[pos] for pos in positions]
    class_labels1 = [train_label[pos] for pos in positions]

    # Negative pairs: every view-0 sample against randomly chosen view-1
    # samples. Some of these are really same-class pairs (noisy labels).
    for anchor in positions:
        for partner in random.sample(range(len(E_Y)), neg_prop):
            view0.append(E_X[anchor])
            view1.append(E_Y[partner])
            labels.append(0)
            class_labels0.append(train_label[anchor])
            class_labels1.append(train_label[partner])
            real_labels.append(0 if train_label[anchor] != train_label[partner] else 1)

    return (
        np.array(view0, dtype=np.float32),
        np.array(view1, dtype=np.float32),
        np.array(labels, dtype=np.int64),
        np.array(real_labels, dtype=np.int64),
        np.array(class_labels0, dtype=np.int64),
        np.array(class_labels1, dtype=np.int64),
    )
|
model.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch.nn as nn
|
| 2 |
+
from torch import optim
|
| 3 |
+
|
| 4 |
+
class endA(nn.Module):
    """One encoder stage: Linear -> BatchNorm1d -> in-place ReLU -> Dropout(0.1)."""

    def __init__(self, in_features, out_features):
        """
        :param in_features: width of the incoming features
        :param out_features: width produced by this stage
        """
        super().__init__()
        stages = [
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(True),
            nn.Dropout(0.1),
        ]
        self.encoder = nn.Sequential(*stages)  # encoding stage

    def forward(self, x):
        """Return the encoded representation of ``x``."""
        return self.encoder(x)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class dedA(nn.Module):
    """One decoder stage: Linear followed by an in-place ReLU."""

    def __init__(self, out_features, in_features):
        """
        Note the parameter naming: the linear layer maps ``out_features``
        inputs to ``in_features`` outputs, i.e. the first argument is the
        width fed INTO this stage and the second is the width it produces.

        :param out_features: width of the incoming features
        :param in_features: width produced by this stage
        """
        super().__init__()
        projection = nn.Linear(out_features, in_features)
        self.decoder = nn.Sequential(projection, nn.ReLU(True))  # decoding stage

    def forward(self, x):
        """Return the decoded representation of ``x``."""
        return self.decoder(x)
|
| 36 |
+
|
| 37 |
+
class SdA(nn.Module):
    """Two-view stacked autoencoder.

    Each view has an encoder stack of ``endA`` stages (``layers1`` for view 1,
    ``layers3`` for view 2) and a decoder stack of ``dedA`` stages
    (``layers2`` / ``layers4``). ``layerll1`` / ``layerll2`` chain a view's
    encoder and decoder into a single sequential module.
    """

    def __init__(self, config):
        """
        :param config: object providing ``input_features1``,
            ``input_features2``, ``enhidden_features``,
            ``dehidden_features1``, ``dehidden_features2`` and ``is_train``
        """
        super().__init__()

        # ---- view 1 ----
        self.layers1 = self._stack(endA, config.input_features1, config.enhidden_features)
        self.layers2 = self._stack(dedA, config.enhidden_features[-1], config.dehidden_features1)
        self.layerll1 = nn.Sequential(self.layers1, self.layers2)

        if config.is_train:
            self.ce_criterion = nn.CrossEntropyLoss()
            self.da_optimizers = self._layer_optimizers(self.layers1)

        # ---- view 2 ----
        self.layers3 = self._stack(endA, config.input_features2, config.enhidden_features)
        self.layers4 = self._stack(dedA, config.enhidden_features[-1], config.dehidden_features2)
        self.layerll2 = nn.Sequential(self.layers3, self.layers4)

        if config.is_train:
            self.ce_criterion = nn.CrossEntropyLoss()
            # NOTE(review): this rebinds ``da_optimizers``, so the list built
            # for view 1 above is dropped and only the view-2 per-layer
            # optimizers remain. Kept as-is to preserve behaviour; confirm
            # against the training code whether both views were intended.
            self.da_optimizers = self._layer_optimizers(self.layers3)

    @staticmethod
    def _stack(layer_cls, first_width, widths):
        """Chain ``layer_cls(in, out)`` stages through the given ``widths``."""
        dims = [first_width, *widths]
        return nn.Sequential(*(layer_cls(n_in, n_out) for n_in, n_out in zip(dims, dims[1:])))

    @staticmethod
    def _layer_optimizers(encoder):
        """One Adam optimizer per encoder stage, excluding the last stage."""
        return [
            optim.Adam(stage.parameters(), lr=0.001, betas=(0.9, 0.99), eps=1e-8, weight_decay=0)
            for stage in encoder[:-1]
        ]

    def forward(self, x1, x2):
        """Encode and reconstruct both views.

        :return: ``(h1, h2, h3, h4)`` where ``h1`` / ``h2`` are the encoder
            outputs of view 1 / view 2 and ``h3`` / ``h4`` are the matching
            decoder outputs
        """
        code1 = self.layers1(x1)
        recon1 = self.layers2(code1)
        code2 = self.layers3(x2)
        recon2 = self.layers4(code2)
        return code1, code2, recon1, recon2

    def regularization_loss(self):
        """Return 0.001 times the sum of squared entries of all parameters."""
        l2_lambda = 0.001
        squared_norm = sum(weight.pow(2).sum() for weight in self.parameters())
        return l2_lambda * squared_norm
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
class Anchormodel(nn.Module):
    """Pair of identically shaped MLP encoders, one per view.

    Each encoder is three ``Linear(., 1024) -> BatchNorm1d -> ReLU ->
    Dropout(0.2)`` stages followed by ``Linear(1024, outfeature) ->
    BatchNorm1d -> ReLU``. The two encoders do not share weights.
    """

    def __init__(self,dim,outfeature):
        """
        :param dim: flattened input width accepted by both encoders
        :param outfeature: width of the embedding each encoder produces
        """
        super().__init__()
        hidden = 1024

        def build_encoder():
            # Same module order (and therefore same state-dict indices) as a
            # hand-written Sequential: 0-3, 4-7, 8-11 hidden stages, 12-14 head.
            modules = []
            width = dim
            for _ in range(3):
                modules += [
                    nn.Linear(width, hidden),
                    nn.BatchNorm1d(hidden),
                    nn.ReLU(True),
                    nn.Dropout(0.2),
                ]
                width = hidden
            modules += [nn.Linear(width, outfeature), nn.BatchNorm1d(outfeature), nn.ReLU(True)]
            return nn.Sequential(*modules)

        self.encoder0 = build_encoder()
        self.encoder1 = build_encoder()

        # Reconstruction decoders are currently disabled:
        # self.decoder0 = nn.Sequential(nn.Linear(outfeature, 1024), nn.ReLU(), nn.Dropout(0.2), nn.Linear(1024, 1024), nn.ReLU(),
        #                               nn.Dropout(0.2), nn.Linear(1024, 1024), nn.ReLU(), nn.Dropout(0.2),
        #                               nn.Linear(1024, dim))
        # self.decoder1 = nn.Sequential(nn.Linear(outfeature, 1024), nn.ReLU(), nn.Dropout(0.2), nn.Linear(1024, 1024), nn.ReLU(),
        #                               nn.Dropout(0.2), nn.Linear(1024, 1024), nn.ReLU(), nn.Dropout(0.2),
        #                               nn.Linear(1024, dim))

    def forward(self, x0, x1):
        """Flatten each input to ``(batch, -1)`` and encode it with its own encoder.

        :return: ``(h0, h1)``, the embeddings of ``x0`` and ``x1``
        """
        flat0 = x0.view(x0.size(0), -1)
        h0 = self.encoder0(flat0)
        flat1 = x1.view(x1.size(0), -1)
        h1 = self.encoder1(flat1)
        # z0 = self.decoder0(h0)
        # z1 = self.decoder1(h1)
        return h0, h1
|
run.py
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import time
|
| 3 |
+
import random
|
| 4 |
+
from model import *
|
| 5 |
+
import math
|
| 6 |
+
import torch,gc
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
from train_methods import *
|
| 10 |
+
import logging
|
| 11 |
+
import sys
|
| 12 |
+
import numpy as np
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
from Datasets import *
|
| 15 |
+
from config import *
|
| 16 |
+
from data_loader import *
|
| 17 |
+
import mat73
|
| 18 |
+
from anchors import *
|
| 19 |
+
from Cluster import *
|
| 20 |
+
def _str2bool(value):
    """Convert a command-line token to a real boolean.

    ``type=bool`` calls ``bool()`` on the raw string, so every non-empty
    token (including "False" and "0") parses as True and the flag can never
    be switched off. This converter accepts the usual spellings instead.

    :raises argparse.ArgumentTypeError: if the token is not a recognised boolean
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays False, as it was under ``bool('')``.
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


# Command-line interface. String defaults are converted by argparse through
# the option's ``type`` callable, so e.g. default='1024' is exposed as int 1024.
parser = argparse.ArgumentParser(description='CAPIMAC in PyTorch')
parser.add_argument('--data', default='1', type=int,
                    help='choice of dataset, 0-HW,1-3Sources,2BBC,3-Scene15, 4-Caltech101,5-ORL_mtv,6-Caltech_7,7-Reuters,'
                         '8-20newsgroups,9-100leaves,10-BBC4,11-MSRCv1,12-BDGP,13-HandWritten,14-yale_mtv,15-Wikipedia-test,16-Movies,17-Prokaryotic,18-ALOI,19-flower17')
parser.add_argument('-bs', '--batch-size', default='1024', type=int, help='number of batch size')
parser.add_argument('-e', '--epochs', default='200', type=int, help='number of epochs to run')
parser.add_argument('-lr', '--learn-rate', default='0.0001', type=float, help='learning rate of adam')
parser.add_argument('-ap', '--aligned-prop', default='0.5', type=float,
                    help='originally aligned proportions in the partially view-aligned data')
parser.add_argument('--gpu', default=0, type=int, help='GPU device idx to use.')
parser.add_argument('-cp', '--complete-prop', default='0.5', type=float,
                    help='originally complete proportions in the partially sample-missing data')
parser.add_argument('-m', '--margin', default='5', type=int, help='initial margin')
parser.add_argument('-s', '--start-fine', default=True, type=_str2bool, help='flag to start use robust loss or not')
parser.add_argument('-np', '--neg-num', default='30', type=int, help='the ratio of negative to positive pairs')
parser.add_argument('-noise', '--noisy-training', type=_str2bool, default=True,
                    help='training with real labels or noisy labels')
parser.add_argument('-r', '--robust', default=1, type=int, help='use our robust loss or not')

# Module-level placeholder; load_data() overwrites it through ``global dim``.
dim=0
|
| 40 |
+
class NoiseRobustLoss(nn.Module):
    """Contrastive pair loss with an optional noise-robust negative term."""

    def __init__(self):
        super().__init__()

    def forward(self, pair_dist, P, margin, use_robust_loss, args):
        """Return the summed pair loss divided by ``2 * len(P)``.

        Positive pairs (``P == 1``) always contribute ``pair_dist ** 2``.
        Negative pairs contribute either

        * the robust term ``(1 / margin) * clamp(sqrt(d) * (0.5 * margin - d), 0) ** 2``
          when ``use_robust_loss == 1`` and ``args.start_fine`` is truthy, or
        * the plain hinge term ``clamp(margin - d, 0) ** 2`` otherwise.

        :param pair_dist: distance of every pair
        :param P: pair labels (1 = positive, 0 = negative); cast to float32
        :param margin: margin of the negative term
        :param use_robust_loss: 1 to allow the robust term
        :param args: object whose ``start_fine`` attribute enables the robust term
        """
        targets = P.to(torch.float32)
        n_pairs = len(targets)
        positive_term = targets * (pair_dist * pair_dist)

        if use_robust_loss == 1 and args.start_fine:
            hinge = torch.clamp(torch.pow(pair_dist, 0.5) * (0.5*margin - pair_dist), min=0.0)
            negative_term = (1 - targets) * (1 / margin) * torch.pow(hinge, 2)
        else:
            hinge = torch.clamp(margin - pair_dist, min=0.0)
            negative_term = (1 - targets) * torch.pow(hinge, 2)

        return torch.sum(positive_term + negative_term) / (2.0 * n_pairs)
|
| 59 |
+
def load_data(align_prop,complete_prop,neg_num,is_noise,dataset):
    """Load a two-view dataset and build training pairs plus evaluation data.

    Steps, in order: seed numpy/torch with a random seed, read
    ``./datasets/<dataset>.mat``, split the samples into an aligned (train)
    and an unaligned (test) part, pre-train on the aligned part, shuffle and
    mask the test part, build positive/negative training pairs, select
    anchors, and re-express every sample through ``find_nanchor``.

    :param align_prop: aligned proportion; ``1 - align_prop`` is handed to ``TT_split``
    :param complete_prop: complete proportion; ``1 - complete_prop`` is handed to ``get_sn``
    :param neg_num: number of negative pairs sampled per aligned sample
    :param is_noise: ``0`` trains on the real pair labels, anything else on the noisy ones
    :param dataset: dataset name, used both for the file path and to pick the
        keys read from the .mat file
    :return: ``(train_pairs, train_pair_labels, train_pair_real_labels,
        all_data, all_label, all_label_X, all_label_Y, dim,
        num_unique_labels, divide_seed)``

    Side effects: prints progress information and rebinds the module-level ``dim``.
    """
    global dim
    # A fresh random seed is drawn (and printed) on every call.
    NetSeed = random.randint(1, 1000)
    # NetSeed=72
    print(NetSeed)
    np.random.seed(NetSeed)
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(NetSeed)  # seed the CPU RNG
    torch.cuda.manual_seed(NetSeed)  # seed the RNG of the current GPU
    # NOTE(review): the command line is parsed again here, only to pass ``args`` to pretrain().
    args = parser.parse_args()
    all_data = []
    map_pairs = []
    label = []
    train_pairs = []

    # NOTE(review): ``sio`` is not imported explicitly in this file; it presumably
    # arrives through ``from data_loader import *`` -- confirm.
    if dataset=='Caltech101_7':
        path = './datasets/' + dataset + '.mat'  # path of the dataset file
        mat = mat73.loadmat(path)  # load the .mat file
    else:
        mat = sio.loadmat('./datasets/' + dataset + '.mat')
    # Pick the two views and the label vector; the key names differ per dataset.
    # NOTE(review): a dataset name matching none of the branches leaves ``data`` undefined.
    if dataset == 'Scene15':
        data = mat['X'][0][0:2]  # 20, 59 dimensions
        label = np.squeeze(mat['Y'])
    elif dataset == 'HandWritten':
        data = mat['X'][0][1:3]
        label = np.squeeze(mat['Y'])
    elif dataset == '3Sources':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset == 'ALOI':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['gt'])
    elif dataset == 'BBCsports':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset == 'Caltech101':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset == 'Reuters_dim10':
        data = []  # 18758 samples
        data.append(normalize(np.vstack((mat['x_train'][0], mat['x_test'][0]))))
        data.append(normalize(np.vstack((mat['x_train'][1], mat['x_test'][1]))))
        label = np.squeeze(np.hstack((mat['y_train'], mat['y_test'])))
    elif dataset == 'ORL_mtv':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['gt'])
    elif dataset == 'Caltech101_7':
        data = mat['data'][3:5]
        data[0], data[1] = np.squeeze(data[0]), np.squeeze(data[1])
        data[0], data[1] = np.array(data[0]), np.array(data[1])
        label = np.squeeze(mat['labels'])
    elif dataset == 'Reuters':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset == '20NewsGroups':
        data = mat['data'][0][1:3]
        label = np.squeeze(mat['truelabel'][0][0])
    elif dataset == '100leaves':
        mat['data'][0][0], mat['data'][0][1] = mat['data'][0][0].T, mat['data'][0][1].T
        data = mat['data'][0][0:2]
        label = np.squeeze(mat['truelabel'][0][0])
    elif dataset == 'BBC4':
        data = mat['data'][0][0:2]
        label = np.squeeze(mat['truelabel'][0][0])
        # print(label)
    elif dataset == 'MSRCv1':
        data = mat['X'][0][1:3]
        label = np.squeeze(mat['Y'])
    elif dataset == 'BDGP':
        mat['X'][0][0], mat['X'][0][1] = mat['X'][0][0].T, mat['X'][0][1].T
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['gt'])
    # NOTE(review): unreachable -- 'HandWritten' is already handled by an earlier branch.
    elif dataset == 'HandWritten':
        data = mat['X'][0][1:3]
        label = np.squeeze(mat['Y'])
    elif dataset == 'yale_mtv':
        mat['X'][0][0], mat['X'][0][1] = mat['X'][0][0].T, mat['X'][0][1].T
        data = mat['X'][0][0:2]
        # print((data))
        label = np.squeeze(mat['gt'])
    elif dataset == 'Wikipedia-test':
        data = mat['X'][0:2][0:2]
        data = np.squeeze(data.T)
        # print(data)
        label = np.squeeze(mat['y'])
    elif dataset == 'Movies':
        data = mat['X'][0:2][0:2]
        data = np.squeeze(data.T)
        # print(data)
        label = np.squeeze(mat['y'])
    elif dataset == 'Prokaryotic':
        value1 = mat['X'][0][0]
        value2 = mat['X'][2][0]
        data = [value1, value2]
        # print(data)
        label = np.squeeze(mat['y'])
    elif dataset == 'flower17':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    # Split the sample indices into an aligned (train) and an unaligned (test) part.
    divide_seed = random.randint(1, 1000)
    train_idx, test_idx = TT_split(len(label), 1 - align_prop, divide_seed)
    train_label, test_label = label[train_idx], label[test_idx]
    if dataset == 'Caltech101_7':
        data[0], data[1] = np.squeeze(data[0]), np.squeeze(data[1])
        print(np.shape(data[0]))
    train_X, train_Y, test_X, test_Y = data[0][train_idx], data[1][train_idx], data[0][test_idx], data[1][test_idx]
    # Obtain the latent representation of the aligned part.
    '''获取对齐部分的潜在表示'''
    map_pairs.append(train_X)
    map_pairs.append(train_Y)
    h0 , h1,epoch_time=pretrain(map_pairs, args)
    all_label = np.concatenate((train_label, test_label))
    # Obtain the initial training data and test data.
    '''获取初始训练数据和测试数据'''
    if align_prop != 1:
        # Break the cross-view correspondence of the test part by permuting view 1.
        shuffle_idx = random.sample(range(len(test_Y)), len(test_Y))
        test_Y = test_Y[shuffle_idx]
        test_label_X, test_label_Y = test_label, test_label[shuffle_idx]
    elif align_prop == 1:
        all_data.append(train_X.T)
        all_data.append(train_Y.T)
    # Incomplete part: drop test samples per view according to the mask.
    '''不完整部分'''
    test_mask = get_sn(2, len(test_label), 1 - complete_prop)
    X_mask, Y_mask = test_mask[:, 0].astype(np.bool_), test_mask[:, 1].astype(np.bool_)
    # test_X[~X_mask] = 0
    # test_Y[~Y_mask] = 0
    test_X, test_Y = test_X[X_mask], test_Y[Y_mask]
    # NOTE(review): test_label_X / test_label_Y are only assigned in the
    # ``align_prop != 1`` branch above, so this line raises NameError when
    # align_prop == 1.
    test_label_X, test_label_Y=test_label_X[X_mask], test_label_Y[Y_mask]
    if align_prop != 1:
        all_label_X = np.concatenate((train_label, test_label_X))
        all_label_Y = np.concatenate((train_label, test_label_Y))
        all_data.append(np.concatenate((train_X, test_X)).T)
        all_data.append(np.concatenate((train_Y, test_Y)).T)
        all_label = np.concatenate((train_label, test_label))
        # all_label_X = test_label_X
        # all_label_Y = test_label_Y
        # all_data.append(test_X.T)
        # all_data.append(test_Y.T)
        # all_label = test_label
    elif align_prop == 1:
        all_label_X, all_label_Y = train_label, train_label
        all_label = train_label
    # Build the training pairs.
    '''构建训练对'''
    view0, view1, noisy_labels, real_labels, _, _ = get_pairs(train_X, train_Y, neg_num, train_label)
    # Count the pairs whose assigned label differs from the class-based label.
    count = 0
    for i in range(len(noisy_labels)):
        if noisy_labels[i] != real_labels[i]:
            count += 1
    # The denominator is the number of negative pairs; it is zero when neg_num == 0.
    print('noise rate of the constructed neg. pairs is ', round(count / (len(noisy_labels) - len(train_X)), 2))

    if is_noise == 0:  # training with real_labels, v/t with real_labels
        print("----------------------Training with real_labels----------------------")
        train_pair_labels = real_labels
    else:  # training with labels, v/t with real_labels
        print("----------------------Training with noisy_labels----------------------")
        train_pair_labels = noisy_labels
    # Initialise the anchors.
    '''初始化锚点'''
    num_unique_labels = np.unique(all_label).shape[0]

    anchors0,anchors1,len_indices=get_anchors(h0,h1,map_pairs,num_unique_labels)  # h0 is a tensor

    # Re-represent the data with respect to the anchors.
    '''数据重表示'''
    view0,view1,all_data[0],all_data[1]=torch.from_numpy(view0).float(),torch.from_numpy(view1).float(),torch.from_numpy(all_data[0]).float(),torch.from_numpy(all_data[1]).float()

    view0, view1, all_data[0],all_data[1]=find_nanchor(anchors0,view0),find_nanchor(anchors1,view1),find_nanchor(anchors0,all_data[0].T),find_nanchor(anchors1,all_data[1].T)
    # (number of anchors x number of samples), enhanced anchor graph
    view0, view1, all_data[0], all_data[1]=np.array(view0),np.array(view1),np.array(all_data[0]),np.array(all_data[1])
    print(np.shape(view0),'view0')
    train_pairs.append(view0)
    train_pairs.append(view1)
    train_pair_real_labels = real_labels
    # Rebinds the module-level ``dim`` (see ``global dim`` above).
    dim=view0.shape[0]
    return train_pairs, train_pair_labels, train_pair_real_labels, all_data, all_label, all_label_X, all_label_Y, dim,num_unique_labels,divide_seed
|
| 230 |
+
|
| 231 |
+
def normalize(x):
    """Min-max scale every column of a 2-D array into ``[0, 1]``.

    A constant column has a zero range, which would otherwise produce
    ``0 / 0 = NaN`` for the whole column; such columns are mapped to 0.

    :param x: 2-D array-like of shape ``(n_samples, n_features)``
    :return: float array of the same shape, scaled column by column
    """
    x = np.asarray(x)
    col_min = np.min(x, axis=0)
    col_range = np.max(x, axis=0) - col_min
    # Divide constant columns by 1 so they come out as exact zeros, not NaN.
    safe_range = np.where(col_range == 0, 1, col_range)
    return (x - col_min) / safe_range
|
| 235 |
+
def loader(train_bs, align_prop, complete_prop,neg_num, is_noise, dataset):
    """Wrap the training pairs from ``load_data`` in a shuffled ``DataLoader``.

    :param train_bs: batch size of the training-pair loader
    :param align_prop: forwarded unchanged to ``load_data``
    :param complete_prop: forwarded unchanged to ``load_data``
    :param neg_num: forwarded unchanged to ``load_data``
    :param is_noise: forwarded unchanged to ``load_data``
    :param dataset: dataset name, forwarded unchanged to ``load_data``
    :return: ``(train_pair_loader, all_data, all_label, all_label_X,
        all_label_Y, dim, num_unique_labels, divide_seed)``; everything
        after the loader is passed through from ``load_data`` untouched.
        The loader shuffles and drops the last incomplete batch.
    """
    loaded = load_data(align_prop,complete_prop,neg_num,is_noise, dataset)
    pairs, pair_labels, pair_real_labels = loaded[:3]
    all_data, all_label, all_label_X, all_label_Y, dim, num_unique_labels, divide_seed = loaded[3:]

    train_pair_loader = DataLoader(
        GetDataset(pairs, pair_labels, pair_real_labels),
        batch_size=train_bs,
        shuffle=True,
        drop_last=True,
    )
    return train_pair_loader, all_data, all_label, all_label_X, all_label_Y, dim,num_unique_labels,divide_seed
|
| 255 |
+
|
| 256 |
+
# Script entry point: load the data, train the anchor model, and after every
# epoch run inference + clustering, finally printing the best value of each metric.
if __name__ == '__main__':
    for i in range(1):
        args = parser.parse_args()
        # ``--data`` indexes into this list.
        data_name = ['HandWritten', '3Sources', 'BBCsports', 'Scene15', 'Caltech101', 'ORL_mtv', 'Caltech101_7', 'Reuters',
                     '20NewsGroups','100leaves','BBC4','MSRCv1','BDGP','HandWritten','yale_mtv','Wikipedia-test','Movies','Prokaryotic','ALOI','flower17']
        # The value loader() returns as the number of unique labels is used as the model's output width.
        train_pair_loader, all_data, all_label, all_label_X, all_label_Y, dim, outfeature ,divide_seed=loader(args.batch_size, args.aligned_prop,args.complete_prop,args.neg_num,args.noisy_training,data_name[args.data])

        model = Anchormodel(dim,outfeature).to(args.gpu)
        criterion = NoiseRobustLoss().to(args.gpu)
        # criterion_mse = nn.MSELoss().to(args.gpu)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learn_rate)
        CAR_list = []
        acc_list, nmi_list, ari_list,f_list,f1_list,pre_list,pre2_list,rec_list,pur_list = [], [], [],[], [], [],[], [], []
        # NOTE(review): train_time is never updated below (epoch_time is
        # assigned but unused), so the final log line always reports 0.
        train_time = 0
        all_data[0], all_data[1]=torch.from_numpy(all_data[0]), torch.from_numpy(all_data[1])
        # NOTE(review): this loop reuses the outer loop variable ``i``.
        for i in range(0, args.epochs + 1):
            if i == 0:
                # Epoch 0 runs train2 with gradient tracking disabled.
                with torch.no_grad():
                    epoch_time = train2(train_pair_loader, model, criterion, optimizer, i, args)
            else:
                epoch_time = train2(train_pair_loader, model, criterion, optimizer, i, args)
            # test
            v0, v1, pred_label, alignment_rate = tiny_infer(model, args.gpu, all_data, all_label_X, all_label_Y)
            CAR_list.append(alignment_rate)
            data = []
            data.append(v0)
            data.append(v1)

            y_pred, ret, accuracy, nmi, ari, f_score, f_score2, precision, precision2, recall, purity = Clustering(data,
                                                                                                                    pred_label)
            if i % 10 == 0:
                print(accuracy, nmi, ari, f_score, f_score2, precision, precision2, recall, purity)
                # logging.info("******** testing ********")
                # logging.info(
                #     "CAR={} kmeans: acc={} nmi={} ari={}".format(round(alignment_rate, 4), ret['kmeans']['accuracy'],
                #                                                  ret['kmeans']['NMI'], ret['kmeans']['ARI']))
            # Record this epoch's k-means metrics.
            acc_list.append(ret['kmeans']['ACC'])
            nmi_list.append(ret['kmeans']['NMI'])
            ari_list.append(ret['kmeans']['ARI'])
            f_list.append(ret['kmeans']['F1'])
            f1_list.append(ret['kmeans']['F2'])
            pre_list.append(ret['kmeans']['PRE'])
            pre2_list.append(ret['kmeans']['PRE2'])
            rec_list.append(ret['kmeans']['REC'])
            pur_list.append(ret['kmeans']['PUR'])
        # Report the best value of every metric over all epochs.
        print('ACC:', max(acc_list))
        print("NMI:", max(nmi_list))
        print("ARI:", max(ari_list))
        print("F1:", max(f_list))
        print("F2:", max(f1_list))
        print("PRE:", max(pre_list))
        print("PRE2:", max(pre2_list))
        print("REC:", max(rec_list))
        print("PUR:", max(pur_list))
        # NOTE(review): no logging configuration is visible in this file, so this
        # INFO record may not be shown -- confirm.
        logging.info('******** End, training time = {} s ********'.format(round(train_time, 2)))
|
sample_kernal.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 3 |
+
|
| 4 |
+
# Gaussian kernel function
|
| 5 |
+
def gaussian_kernel(x, x_i, bandwidth):
    """Evaluate an unnormalised Gaussian kernel between ``x`` and ``x_i``.

    Returns ``exp(-0.5 * ((x - x_i) / bandwidth) ** 2)``. No
    ``1 / (bandwidth * sqrt(2 * pi))`` factor is applied, so the value is
    exactly 1 when ``x == x_i``. NumPy broadcasting applies when any
    argument is an array.
    """
    scaled_offset = (x - x_i) / bandwidth
    return np.exp(-0.5 * scaled_offset ** 2)
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Kernel-regression interpolation (supports multi-dimensional outputs)
|
| 10 |
+
def kernel_regression_multi_dim(x_known, y_known, x_targets, bandwidth):
    """Interpolate multi-dimensional values with Gaussian kernel regression.

    Each target is a weighted average of the known values, with weights
    ``exp(-0.5 * ((x_target - x_i) / bandwidth) ** 2)`` normalised to sum
    to one (Nadaraya-Watson estimator).

    Args:
        x_known: x coordinates of the known points, 1-D, length n_samples.
        y_known: values at the known points, 2-D array of shape
            (n_samples, n_features).
        x_targets: x coordinates to interpolate at, 1-D (a scalar is
            treated as a single target).
        bandwidth: bandwidth of the Gaussian kernel.

    Returns:
        Array of shape (n_targets, n_features). An empty ``x_targets``
        yields shape (0, n_features) so the result can still be stacked
        under ``y_known``.

    Raises:
        ValueError: if ``x_known`` is empty.
    """
    x_known = np.asarray(x_known, dtype=float).ravel()
    y_known = np.asarray(y_known)
    x_targets = np.atleast_1d(np.asarray(x_targets, dtype=float)).ravel()

    if x_known.size == 0:
        raise ValueError("x_known must contain at least one point")

    # Log-weights for every (target, known) pair in one broadcasted step.
    scaled = (x_targets[:, np.newaxis] - x_known[np.newaxis, :]) / bandwidth
    log_weights = -0.5 * scaled ** 2

    # Subtracting the row maximum leaves the normalised weights unchanged
    # but guarantees at least one weight equals 1, so a target far from
    # every known point no longer underflows to 0 / 0 = NaN.
    log_weights -= log_weights.max(axis=1, keepdims=True)
    weights = np.exp(log_weights)
    weights /= weights.sum(axis=1, keepdims=True)

    # (n_targets, n_samples) @ (n_samples, n_features)
    return weights @ y_known
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def insert_and_sort(x_known, y_known, x_targets, y_targets):
    """Merge target points into the known points and order them by x.

    The x arrays are concatenated, the y arrays are stacked row-wise, and
    both are reordered by ascending x.

    Returns:
        Tuple ``(x_sorted, y_sorted)`` holding the merged, sorted data.
    """
    merged_x = np.concatenate((x_known, x_targets))
    merged_y = np.vstack((y_known, y_targets))

    # One permutation drives both arrays so rows stay paired with their x.
    order = np.argsort(merged_x)
    return merged_x[order], merged_y[order]
|
train_methods.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
|
| 3 |
+
from model import SdA
|
| 4 |
+
from config import *
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
import torch
|
| 7 |
+
import time
|
| 8 |
+
import logging
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
def train1(train_pairs, model, criterion, optimizer, epoch, args):
    """Run one full-batch training step on a pair of views.

    Both views are converted to float tensors on ``args.gpu`` and passed
    through ``model``, which must return ``(h0, h1, d0, d1)``. The loss is
    ``criterion(x0, d0) + criterion(x1, d1) + model.regularization_loss()``.
    Parameters are updated only when ``epoch != 0``; epoch 0 is a
    forward-only pass.

    Args:
        train_pairs: sequence whose first two items are NumPy arrays, one
            per view.
        model: module returning ``(h0, h1, d0, d1)`` and exposing
            ``regularization_loss()``.
        criterion: loss callable applied as ``criterion(input, output)``.
        optimizer: optimizer over the model parameters.
        epoch: current epoch index.
        args: namespace providing ``gpu`` (device) and ``epochs``.

    Returns:
        Tuple ``(h0, h1, epoch_time)``: the first two model outputs and the
        wall-clock seconds spent in this call.

    Raises:
        Exception: any error raised by the forward pass is logged and
            re-raised unchanged.
    """
    if epoch % 10 == 0:
        logging.info("=======> Train epoch: {}/{}".format(epoch, args.epochs))
    model.train()
    time0 = time.time()

    x0 = torch.from_numpy(train_pairs[0]).float().to(args.gpu)
    x1 = torch.from_numpy(train_pairs[1]).float().to(args.gpu)

    try:
        h0, h1, d0, d1 = model(x0, x1)
    except Exception:
        # Previously the error was printed and execution continued, which
        # then failed with an unrelated UnboundLocalError on d0. Log the
        # real failure and let it propagate.
        logging.exception("error raise in batch %s", epoch)
        raise

    loss = criterion(x0, d0) + criterion(x1, d1)
    loss = loss + model.regularization_loss()  # L2 regularisation term

    if epoch != 0:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_time = time.time() - time0
    return h0, h1, epoch_time
|
| 36 |
+
def pretrain(train_pairs, args):
    """Train a fresh ``SdA`` model on the view pair.

    Builds ``SdA(config)``, an MSE criterion and an Adam optimizer using
    ``args.learn_rate``, then calls ``train1`` for epochs
    ``0 .. args.epochs`` inclusive. Epoch 0 runs under ``torch.no_grad()``
    because ``train1`` performs no parameter update for it.

    Returns:
        The ``(h0, h1, epoch_time)`` tuple produced by the last epoch.
    """
    device = args.gpu
    model = SdA(config).to(device)
    criterion = nn.MSELoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learn_rate)

    for epoch in range(args.epochs + 1):
        if epoch > 0:
            h0, h1, epoch_time = train1(train_pairs, model, criterion, optimizer, epoch, args)
            continue
        # Forward-only pass for epoch 0: skip building the autograd graph.
        with torch.no_grad():
            h0, h1, epoch_time = train1(train_pairs, model, criterion, optimizer, epoch, args)
    return h0, h1, epoch_time
|
| 48 |
+
|
| 49 |
+
def train2(train_loader, model, criterion, optimizer, epoch, args):
    """Train ``model`` for one pass over ``train_loader`` with a pair loss.

    Each batch yields ``(x0, x1, labels, real_labels)``. ``labels`` are the
    (possibly noisy) labels of the constructed pairs and drive the loss;
    ``real_labels`` are the clean labels and are not used here. Inputs are
    flattened to ``(batch, -1)`` before the forward pass, and the loss is
    ``criterion(pairwise_distance(h0, h1), labels, args.margin,
    args.robust, args)``. Parameters are updated only when ``epoch != 0``.

    Args:
        train_loader: iterable of ``(x0, x1, labels, real_labels)`` tensors.
        model: module returning ``(h0, h1)`` for two flattened inputs.
        criterion: loss callable with the signature shown above.
        optimizer: optimizer over the model parameters.
        epoch: current epoch index.
        args: namespace providing ``gpu``, ``margin`` and ``robust``.

    Returns:
        Wall-clock seconds spent in this call.

    Raises:
        Exception: any error raised by the forward pass is logged and
            re-raised unchanged.
    """
    model.train()
    time0 = time.time()
    for batch_idx, (x0, x1, labels, _real_labels) in enumerate(train_loader):
        x0, x1, labels = x0.to(args.gpu), x1.to(args.gpu), labels.to(args.gpu)
        logging.debug("batch %d input shape: %s", batch_idx, tuple(x0.shape))

        try:
            h0, h1 = model(x0.view(x0.size()[0], -1), x1.view(x1.size()[0], -1))
        except Exception:
            # Previously the error was printed and execution continued,
            # which then failed with an unrelated UnboundLocalError on h0.
            # Log the real failure and let it propagate.
            logging.exception("error raise in batch %s", batch_idx)
            raise

        pair_dist = F.pairwise_distance(h0, h1)
        loss = criterion(pair_dist, labels, args.margin, args.robust, args)

        if epoch != 0:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    epoch_time = time.time() - time0
    return epoch_time
|
utils.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import random
|
| 3 |
+
from sklearn.preprocessing import OneHotEncoder
|
| 4 |
+
from numpy.random import randint
|
| 5 |
+
import math
|
| 6 |
+
import torch
|
| 7 |
+
def TT_split(n_all, test_prop, seed):
    """Split the indices ``0 .. n_all - 1`` into train and test sets.

    The indices are shuffled with Python's global ``random`` generator,
    which is re-seeded with ``seed`` as a side effect. The last
    ``floor(test_prop * n_all)`` shuffled indices form the test set and
    all remaining indices form the training set, so the two sets are
    always disjoint and together cover every index.

    Args:
        n_all: total number of samples.
        test_prop: fraction of samples assigned to the test set.
        seed: seed for the shuffle.

    Returns:
        Tuple ``(train_idx, test_idx)`` of index lists.
    """
    random.seed(seed)
    random_idx = random.sample(range(n_all), n_all)

    test_num = int(np.floor(test_prop * n_all))
    # Derive the train size from the test size instead of rounding both
    # independently: ceil/floor on separately rounded floats could overlap.
    train_num = n_all - test_num

    train_idx = random_idx[:train_num]
    # Slice from the front offset; ``random_idx[-0:]`` would wrongly
    # return every index when the test set is empty.
    test_idx = random_idx[train_num:]
    return train_idx, test_idx
|
| 18 |
+
|
| 19 |
+
def get_sn(view_num, alldata_len, missing_rate):
    """Randomly generate a view-availability mask for incomplete data.

    Simulates partial-view data from complete-view data. Entry ``[i, v]``
    of the result is 1 when view ``v`` of sample ``i`` is kept and 0 when
    it is missing. Every sample keeps at least one view.

    :param view_num: number of views
    :param alldata_len: number of samples
    :param missing_rate: missing rate (the original note says it is
        defined in section 4.3 of the paper); it is halved internally
        before use
    :return: array of shape (alldata_len, view_num); a float one-hot
        matrix when only one view per sample can be kept, otherwise an
        integer 0/1 matrix

    Random numbers come from NumPy's global generator via ``randint``.
    """
    def _random_one_hot():
        # One randomly chosen view per sample, as a one-hot row. Indexing
        # an identity matrix always yields ``view_num`` columns, whereas an
        # encoder that infers categories from the data drops the column of
        # any view that happens not to be drawn (likely for small samples).
        picks = randint(0, view_num, size=(alldata_len, 1))
        return np.eye(view_num)[picks.ravel()]

    missing_rate = missing_rate / 2
    one_rate = 1.0 - missing_rate
    if one_rate <= (1 / view_num):
        # Only a single view per sample can be kept.
        return _random_one_hot()
    error = 1
    if one_rate == 1:
        # Nothing is missing: all-ones mask.
        matrix = randint(1, 2, size=(alldata_len, view_num))
        return matrix
    max_iterations = 200  # upper bound on resampling attempts
    iterations = 0

    # Resample until the kept fraction is within 0.005 of one_rate, or
    # the attempt budget is exhausted.
    while error >= 0.005 and iterations < max_iterations:
        view_preserve = _random_one_hot()
        # Number of extra ones needed on top of the one-per-row baseline.
        one_num = view_num * alldata_len * one_rate - alldata_len
        ratio = one_num / (view_num * alldata_len)
        matrix_iter = (randint(0, 100, size=(alldata_len, view_num)) < int(ratio * 100)).astype(int)
        # Count of sampled ones that collide with the guaranteed view.
        a = np.sum(((matrix_iter + view_preserve) > 1).astype(int))

        # Inflate the sampling ratio to compensate for those collisions.
        # NOTE(review): a == one_num would divide by zero here.
        one_num_iter = one_num / (1 - a / one_num)
        ratio = one_num_iter / (view_num * alldata_len)
        matrix_iter = (randint(0, 100, size=(alldata_len, view_num)) < int(ratio * 100)).astype(int)
        matrix = ((matrix_iter + view_preserve) > 0).astype(int)
        ratio = np.sum(matrix) / (view_num * alldata_len)
        error = abs(one_rate - ratio)
        iterations = iterations + 1
    return matrix
|
| 55 |
+
|
| 56 |
+
def cosineSimilartydis(A,B):
    """Column-wise min-max scaled cosine distance between rows of A and B.

    Rows of ``A`` and ``B`` are scaled by their L2 norm (plus 1e-6 to
    avoid division by zero) and multiplied to give a similarity matrix
    with one row per row of ``A`` and one column per row of ``B``. Each
    column is then min-max scaled and the result is returned as
    ``1 - scaled similarity``. For columns whose spread exceeds the 1e-6
    floor, the most similar entry becomes 0 and the least similar 1.
    """
    eps = 0.000001
    a_unit = A / (A.norm(p=2, dim=1, keepdim=True) + eps)
    b_unit = B / (B.norm(p=2, dim=1, keepdim=True) + eps)

    similarity = torch.mm(a_unit, b_unit.t())
    col_max = similarity.max(dim=0).values
    col_min = similarity.min(dim=0).values
    # Floor the spread so constant columns do not divide by zero.
    spread = torch.clamp(col_max - col_min, min=1e-6)
    return 1 - (similarity - col_min) / spread
|
| 67 |
+
|
| 68 |
+
def find_nanchor(A,B):
    """Zero the smallest entries in each column of the distance matrix.

    Computes ``W = cosineSimilartydis(A, B)`` and, for every column, sets
    the ``n = ceil(rows / 19)`` smallest entries to 0 in a copy of ``W``.
    All other entries keep their value.

    The selection runs in torch, so it works on any device and on tensors
    that require grad. ``n`` is capped at the number of rows so a
    single-row matrix no longer raises.

    Returns:
        A new tensor with the same shape as ``W``.
    """
    W = cosineSimilartydis(A, B)  # distance-like matrix
    n_rows = W.shape[0]
    n = min(math.ceil(n_rows / 19), n_rows)

    # Work on a copy so the matrix returned by the helper is not modified.
    modified_matrix_A = W.clone()
    if n == 0 or modified_matrix_A.shape[1] == 0:
        return modified_matrix_A

    # Row indices of the n smallest values of each column, shape (n, cols).
    nearest = torch.topk(W, n, dim=0, largest=False).indices
    modified_matrix_A.scatter_(0, nearest, 0.0)
    return modified_matrix_A
|