Upload 13 files

Browse files

Files changed (14) hide show

.gitattributes +2 -0
3Sources.mat +3 -0
Cluster.py +284 -0
Datasets.py +39 -0
Prokaryotic.mat +3 -0
SRW_KNN_greedy.py +219 -0
anchors.py +22 -0
config.py +85 -0
data_loader.py +37 -0
model.py +181 -0
run.py +310 -0
sample_kernal.py +44 -0
train_methods.py +73 -0
utils.py +80 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+3Sources.mat filter=lfs diff=lfs merge=lfs -text
+Prokaryotic.mat filter=lfs diff=lfs merge=lfs -text

3Sources.mat ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0b56f4c3441fbaa0dc0281897851722c122bb00b24abdef15ff7e4c88dace833
+size 112113

Cluster.py ADDED Viewed

	@@ -0,0 +1,284 @@

+import torch
+import numpy as np
+import torch.nn
+from utils import cosineSimilartydis
+from sklearn import metrics
+import sklearn.metrics as metrics
+from sklearn.cluster import KMeans
+from munkres import Munkres
+import sys
+import logging
+from sample_kernal import *
+def tiny_infer(model, device, all_data, all_label_X, all_label_Y):
+    """Greedily align the two views' embeddings, pad the shorter view, then re-align.
+
+    The sample count of each view is read from ``all_data[i].shape[1]``. The view
+    with fewer samples is matched one-to-one against the other view by repeatedly
+    taking the smallest not-yet-used entry of ``cosineSimilartydis``. The matched
+    embeddings of the shorter view are ordered by their match distance, extra rows
+    are produced with ``kernel_regression_multi_dim`` / ``insert_and_sort`` (from
+    ``sample_kernal``; presumably interpolation -- TODO confirm), and the longer
+    view is then greedily matched against that padded set.
+
+    Args:
+        model: module called as ``model(x0, x1)``; its result is unpacked into
+            exactly two embedding tensors ``h0, h1``.
+        device: device the inputs and labels are moved to.
+        all_data: pair of tensors, one per view; columns are treated as samples.
+        all_label_X: numpy label array used for view 0.
+        all_label_Y: numpy label array used for view 1.
+
+    Returns:
+        ``(alignre0, alignre1, class_labels_cluster, inference_acc)``: the longer
+        view's embeddings, the rows picked for them from the padded set, the labels
+        of the shorter view, and ``sum(align_labels) / test_num``.
+    """
+    model.eval()
+    align_out0 = []
+    align_out1 = []
+    sort_value=[]
+    class_labels_cluster = []
+    len_alldata0 = all_data[0].shape[1]
+    len_alldata1 = all_data[1].shape[1]
+    # print(len_alldata0)
+    # print(len_alldata1)
+    len_map=max(len_alldata0, len_alldata1)
+    align_labels = torch.zeros(len_map)
+    # `labels`/`test_num` describe the shorter view, `long_labels`/`long_num` the longer one.
+    if len_alldata0 > len_alldata1:
+        labels = all_label_Y
+        long_labels=all_label_X
+        test_num = len_alldata1
+        long_num= len_alldata0
+    else:
+        labels = all_label_X
+        long_labels = all_label_Y
+        test_num = len_alldata0
+        long_num = len_alldata1
+    labels = torch.from_numpy(labels)
+    with torch.no_grad():
+        x0, x1, labels = all_data[0].to(device), all_data[1].to(device), labels.to(device)
+        # Flatten trailing dims and transpose so that each row is one sample.
+        x0 = x0.view(x0.size()[0], -1).T
+        x1 = x1.view(x1.size()[0], -1).T
+        h0, h1 = model(x0, x1)
+        # First pass: for every sample of the shorter view pick the closest unused
+        # sample of the longer view; the chosen column is set to inf so it is not reused.
+        if len_alldata0 > len_alldata1:
+            C = cosineSimilartydis(h0, h1).T
+            C_temp=C.clone()
+            for i in range(test_num):
+                idx = torch.argsort(C[i, :])
+                sort_value.append(C_temp[i, idx[0]])
+                C[:, idx[0]] = float("inf")
+                align_out0.append((h1[i, :].cpu()).numpy())
+                align_out1.append((h0[idx[0], :].cpu()).numpy())# same (reduced) length as align_out0
+                # if all_label_Y[i] == all_label_X[idx[0]]:
+                #     align_labels[i] = 1
+        else:
+            C = cosineSimilartydis(h0, h1)
+            C_temp = C.clone()
+            for i in range(test_num):
+                idx = torch.argsort(C[i, :])
+                sort_value.append(C_temp[i, idx[0]])
+                C[:, idx[0]] = float("inf")
+                align_out0.append((h0[i, :].cpu()).numpy())
+                align_out1.append((h1[idx[0], :].cpu()).numpy())
+                # if all_label_X[i] == all_label_Y[idx[0]]:
+                #     align_labels[i] = 1
+        sort_value,align_out0=torch.tensor(sort_value),torch.tensor(align_out0)
+        sorted_list, sorted_indice0 = torch.sort(sort_value)
+        sorted_indice0 = sorted_indice0.to(torch.long)
+        # Reorder align_out0 with the sorted indices (ascending match distance).
+        sorted_align0 = align_out0[sorted_indice0]
+        # Gaps between adjacent sorted match distances.
+        differences = sorted_list[1:] - sorted_list[:-1]
+        x_known = np.arange(len(sorted_list))
+        # Build the (i, i + 1) index pair that belongs to each gap.
+        index_pairs = [(i, i + 1) for i in range(len(sorted_list) - 1)]
+        # Number of rows to add so the shorter view reaches the longer view's size.
+        Xn=long_num-test_num
+        top_values, top_indices = torch.topk(differences, Xn)
+        # Map top_indices back to their index pairs; new rows are placed at the midpoints.
+        top_index_pairs = [index_pairs[i.item()] for i in top_indices]
+        average_indices = [(i + j) / 2 for i, j in top_index_pairs]
+        average_indices=np.array(average_indices)
+        bandwidth=1.0
+        index_pairs=np.array(index_pairs)
+        sorted_align0=np.array(sorted_align0)
+        A3_initial = kernel_regression_multi_dim(x_known, sorted_align0, average_indices, bandwidth)
+        x_known_sorted, y_sorted_align0 = insert_and_sort(x_known, sorted_align0, average_indices, A3_initial)
+        alignre0,alignre1=[],[]
+        # NOTE(review): hard-coded 'cuda' ignores the `device` argument -- confirm intent.
+        y_sorted_align0=torch.tensor(y_sorted_align0).to('cuda')
+        y_sorted_align0=y_sorted_align0.float()
+        # Second pass: greedily match every sample of the longer view against the padded set.
+        # NOTE(review): both loops below compute idx0/idx1 but then index with `idx[0]`,
+        # which still holds the value from the last iteration of the first pass --
+        # looks like a typo; confirm before relying on alignre1 / align_labels.
+        if len_alldata0 > len_alldata1:
+            Cre = cosineSimilartydis(h0,y_sorted_align0)
+            for i in range(long_num):
+                idx0 = torch.argsort(Cre[i, :])
+                Cre[:, idx0[0]] = float("inf")
+                alignre0.append((h0[i, :].cpu()).numpy())
+                alignre1.append((y_sorted_align0[idx[0], :].cpu()).numpy())
+                if all_label_X[i] == all_label_Y[idx[0]]:
+                    align_labels[i] = 1
+        else:
+            Cre = cosineSimilartydis(h1,y_sorted_align0)
+            for i in range(long_num):
+                idx1 = torch.argsort(Cre[i, :])
+                Cre[:, idx1[0]] = float("inf")
+                alignre0.append((h1[i, :].cpu()).numpy())
+                alignre1.append((y_sorted_align0[idx[0], :].cpu()).numpy())
+                if all_label_Y[i] == all_label_X[idx[0]]:
+                    align_labels[i] = 1
+        class_labels_cluster.extend(labels.cpu().numpy())
+#
+    count = torch.sum(align_labels)
+    # print(test_num,'testnum')
+    # NOTE(review): align_labels is filled over long_num samples but divided by test_num.
+    inference_acc = count.item() / test_num
+    print(inference_acc)
+    print(np.shape(align_out1))
+    return np.array(alignre0), np.array(alignre1), np.array(class_labels_cluster), inference_acc
+    # return np.array(align_out0), np.array(align_out1), np.array(class_labels_cluster), inference_acc
+def Clustering(x_list, y):
+    # logging.info('******** Clustering ********')
+    n_clusters = np.size(np.unique(y))
+    # np.random.seed(1)
+    x_final_concat = np.concatenate(x_list[:], axis=1)
+    kmeans_assignments, km = get_cluster_sols(x_final_concat, ClusterClass=KMeans, n_clusters=n_clusters,
+                                              init_args={'n_init': 10})
+    y_preds = get_y_preds(y, kmeans_assignments, n_clusters)
+    if np.min(y) == 1:
+        y = y - 1
+    scores, _ ,accuracy,nmi,ari,f_score,f_score2,precision,precision2,recall,purity= clustering_metric(y, kmeans_assignments, n_clusters)
+    ret = {}
+    ret['kmeans'] = scores
+    return y_preds, ret,accuracy,nmi,ari,f_score,f_score2,precision,precision2,recall,purity
+def get_y_preds(y_true, cluster_assignments, n_clusters):
+    '''
+    Computes the predicted labels, where label assignments now
+    correspond to the actual labels in y_true (as estimated by Munkres)
+    cluster_assignments:    array of labels, outputted by kmeans
+    y_true:                 true labels
+    n_clusters:             number of clusters in the dataset
+    returns:    a tuple containing the accuracy and confusion matrix,
+                in that order
+    '''
+    confusion_matrix = metrics.confusion_matrix(y_true, cluster_assignments, labels=None)
+    # compute accuracy based on optimal 1:1 assignment of clusters to labels
+    cost_matrix = calculate_cost_matrix(confusion_matrix, n_clusters)
+    indices = Munkres().compute(cost_matrix)
+    kmeans_to_true_cluster_labels = get_cluster_labels_from_indices(indices)
+    if np.min(cluster_assignments) != 0:
+        cluster_assignments = cluster_assignments - np.min(cluster_assignments)
+    y_pred = kmeans_to_true_cluster_labels[cluster_assignments]
+    return y_pred
+def get_cluster_sols(x, cluster_obj=None, ClusterClass=None, n_clusters=None, init_args={}):
+    '''
+    Using either a newly instantiated ClusterClass or a provided
+    cluster_obj, generates cluster assignments based on input data
+    x:              the points with which to perform clustering
+    cluster_obj:    a pre-fitted instance of a clustering class
+    ClusterClass:   a reference to the sklearn clustering class, necessary
+                    if instantiating a new clustering class
+    n_clusters:     number of clusters in the dataset, necessary
+                    if instantiating new clustering class
+    init_args:      any initialization arguments passed to ClusterClass
+    returns:    a tuple containing the label assignments and the clustering object
+    '''
+    # if provided_cluster_obj is None, we must have both ClusterClass and n_clusters
+    assert not (cluster_obj is None and (ClusterClass is None or n_clusters is None))
+    cluster_assignments = None
+    if cluster_obj is None:
+        cluster_obj = ClusterClass(n_clusters, **init_args)
+        for _ in range(10):
+            try:
+                cluster_obj.fit(x)
+                break
+            except:
+                print("Unexpected error:", sys.exc_info())
+        else:
+            return np.zeros((len(x),)), cluster_obj
+    cluster_assignments = cluster_obj.predict(x)
+    return cluster_assignments, cluster_obj
+def calculate_cost_matrix(C, n_clusters):
+    cost_matrix = np.zeros((n_clusters, n_clusters))
+    # cost_matrix[i,j] will be the cost of assigning cluster i to label j
+    for j in range(n_clusters):
+        s = np.sum(C[:, j])  # number of examples in cluster i
+        for i in range(n_clusters):
+            t = C[i, j]
+            cost_matrix[j, i] = s - t
+    return cost_matrix
+def get_cluster_labels_from_indices(indices):
+    n_clusters = len(indices)
+    clusterLabels = np.zeros(n_clusters)
+    for i in range(n_clusters):
+        clusterLabels[i] = indices[i][1]
+    return clusterLabels
+def clustering_metric(y_true, y_pred, n_clusters, verbose=False, decimals=4):
+    y_pred_ajusted = get_y_preds(y_true, y_pred, n_clusters)
+    classification_metrics, confusion_matrix = classification_metric(y_true, y_pred_ajusted)
+    accuracy = metrics.accuracy_score(y_true, y_pred_ajusted)
+    accuracy = np.round(accuracy, decimals)
+    # AMI
+    ami = metrics.adjusted_mutual_info_score(y_true, y_pred_ajusted)
+    ami = np.round(ami, decimals)
+    # NMI
+    nmi = metrics.normalized_mutual_info_score(y_true, y_pred_ajusted)
+    nmi = np.round(nmi, decimals)
+    # ARI
+    ari = metrics.adjusted_rand_score(y_true, y_pred_ajusted)
+    ari = np.round(ari, decimals)
+    #fscore
+    f_score = metrics.f1_score(y_true, y_pred_ajusted, average='macro')
+    f_score = np.round(f_score, decimals)
+    f_score2 = metrics.f1_score(y_true, y_pred_ajusted, average='weighted')
+    f_score2 = np.round(f_score2, decimals)
+    # precision
+    precision = metrics.precision_score(y_true, y_pred_ajusted, average='macro')
+    precision = np.round(precision, decimals)
+    precision2 = metrics.precision_score(y_true, y_pred_ajusted, average='weighted')
+    precision2 = np.round(precision2, decimals)
+    # recall
+    recall = metrics.recall_score(y_true, y_pred_ajusted, average='macro')
+    recall = np.round(recall, decimals)
+    # Purity
+    purity = Purity(y_true, y_pred_ajusted)
+    purity = np.round(purity, decimals)
+    # print(accuracy,nmi,ari,f_score,f_score2,precision,precision2,recall,purity,"zb")
+    # if verbose:
+    #     logging.info('AMI: {}, NMI: {}, ARI: {}'.format(ami, nmi, ari))
+    # return dict({'AMI': ami, 'NMI': nmi, 'ARI': ari}, **classification_metrics), confusion_matrix,accuracy,nmi,ari,f_score,f_score2,precision,precision2,recall,purity
+    return dict({'ACC': accuracy,'AMI': ami, 'NMI': nmi, 'ARI': ari, 'F1': f_score, 'F2': f_score2, 'PRE': precision, 'PRE2': precision2, 'REC': recall, 'PUR': purity}), confusion_matrix, accuracy, nmi, ari, f_score, f_score2, precision, precision2, recall, purity
+def Purity(y_true, y_pred):
+    y_voted_labels = np.zeros(y_true.shape)
+    labels = np.unique(y_true)
+    ordered_labels = np.arange(labels.shape[0])
+    for k in range(labels.shape[0]):
+        y_true[y_true == labels[k]] = ordered_labels[k]
+    labels = np.unique(y_true)
+    bins = np.concatenate((labels, [np.max(labels) + 1]), axis=0)
+    for cluster in np.unique(y_pred):
+        hist, _ = np.histogram(y_true[y_pred == cluster], bins=bins)
+        winner = np.argmax(hist)
+        y_voted_labels[y_pred == cluster] = winner
+    return metrics.accuracy_score(y_true, y_voted_labels)
+def classification_metric(y_true, y_pred, average='macro', verbose=False, decimals=4):
+    # confusion matrix
+    confusion_matrix = metrics.confusion_matrix(y_true, y_pred)
+    # ACC
+    accuracy = metrics.accuracy_score(y_true, y_pred)
+    accuracy = np.round(accuracy, decimals)
+    # precision
+    precision = metrics.precision_score(y_true, y_pred, average=average)
+    precision = np.round(precision, decimals)
+    # recall
+    recall = metrics.recall_score(y_true, y_pred, average=average)
+    recall = np.round(recall, decimals)
+    # F-score
+    f_score = metrics.f1_score(y_true, y_pred, average=average)
+    f_score = np.round(f_score, decimals)
+    if verbose:
+        # print('Confusion Matrix')
+        # print(confusion_matrix)
+        logging.info('accuracy: {}, precision: {}, recall: {}, f_measure: {}'.format(accuracy, precision, recall, f_score))
+    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f_measure': f_score}, confusion_matrix

Datasets.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from torch.utils.data import Dataset, DataLoader
+import numpy as np
+import torch
+class GetDataset(Dataset):
+    def __init__(self, data, labels, real_labels):
+        self.data = data
+        self.labels = labels
+        self.real_labels = real_labels
+    def __getitem__(self, index):
+        fea0, fea1 = torch.from_numpy(self.data[0][:, index]).float(), torch.from_numpy(self.data[1][:, index]).float()
+        fea0, fea1 = fea0.unsqueeze(0), fea1.unsqueeze(0)
+        label = np.int64(self.labels[index])
+        if len(self.real_labels) == 0:
+            return fea0, fea1, label
+        real_label = np.int64(self.real_labels[index])
+        return fea0, fea1, label, real_label
+    def __len__(self):
+        return len(self.labels)
+class GetAllDataset(Dataset):
+    def __init__(self, data, labels, class_labels0, class_labels1):
+        self.data = data
+        self.labels = labels
+        self.class_labels0 = class_labels0
+        self.class_labels1 = class_labels1
+    def __getitem__(self, index):
+        fea0, fea1 = torch.from_numpy(self.data[0][:, index]).float(), torch.from_numpy(self.data[1][:, index]).float()
+        fea0, fea1 = fea0.unsqueeze(0), fea1.unsqueeze(0)
+        label = np.int64(self.labels[index])
+        class_labels0 = np.int64(self.class_labels0[index])
+        class_labels1 = np.int64(self.class_labels1[index])
+        return fea0, fea1, label, class_labels0, class_labels1
+    def __len__(self):
+        return len(self.labels)

Prokaryotic.mat ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:20471598a8c819fb35e94f102cda4300a3b4f2cd185d3cbe0ea06e0349a7ed7c
+size 3105301

SRW_KNN_greedy.py ADDED Viewed

	@@ -0,0 +1,219 @@

+import numpy as np
+import torch
+import umap
+import matplotlib.pyplot as plt
+from run import *
+# Fix the random seed and generate 100 two-dimensional data points
+# torch.manual_seed(99)
+def para(data,num_nodes,num_class):
+    similarity_threshold = 0.4  # 相似度阈值
+    num_anchors =  num_class*2# 锚点数量
+    # num_anchors =26
+    distances = cosineSimilartydis(data, data)
+    # 排除对角线上的自身距离（0）的平均值
+    mean_distance = distances[~torch.eye(distances.size(0), dtype=torch.bool)].mean()
+    coverage_radius=mean_distance*0.3 # 贪心覆盖算法中的覆盖半径
+    #到时候写一个对齐数据少于锚点数量error的提示
+    if num_nodes < 100:  # 小图
+        num_walks,walk_length = 20,3
+    elif num_nodes < 1000:  # 中型图
+        num_walks,walk_length = 10,5
+    elif num_nodes < 10000:  # 大型图
+        num_walks,walk_length = 5,10
+    else:  # 超大图
+        num_walks,walk_length = 3,20
+    return num_walks,walk_length,similarity_threshold,num_anchors,coverage_radius
+def cosineSimilarty(A,B):
+    A=A/(torch.norm(A,dim=1,p=2,keepdim=True)+0.000001)
+    # A2 = A / (torch.norm(A, dim=0, p=2, keepdim=True) + 0.000001)
+    B=B/(torch.norm(B,dim=1,p=2,keepdim=True)+0.000001)
+    # B2 = B / (torch.norm(B, dim=0, p=2, keepdim=True) + 0.000001)
+    W=torch.mm(A,B.t())
+    max_values,_ = torch.max(W, axis=0)
+    min_values,_ = torch.min(W, axis=0)
+    normalized_matrix = (W - min_values) / (max_values - min_values)
+    normalized_matrix = torch.nan_to_num(normalized_matrix, nan=0.0001)
+    return normalized_matrix
+def cosineSimilartydis(A,B):
+    A=A/(torch.norm(A,dim=1,p=2,keepdim=True)+0.000001)
+    B=B/(torch.norm(B,dim=1,p=2,keepdim=True)+0.000001)
+    W=torch.mm(A,B.t())
+    max_values, _ = torch.max(W, axis=0)
+    min_values, _ = torch.min(W, axis=0)
+    normalized_matrix = (W - min_values) / (max_values - min_values)
+    normalized_matrix = torch.nan_to_num(normalized_matrix, nan=0.0001)
+    return 1-normalized_matrix
+# # 随机游走参数
+#
+#
+# Step 1: 初始化完全图的转移概率矩阵
+# distances = torch.cdist(data, data, p=2)  # 计算所有点之间的欧几里得距离
+# adj_matrix = torch.exp(-distances)  # 高斯权重：距离越小权重越高
+# def visit(data):
+#     adj_matrix=cosineSimilarty(data,data)
+#     print(np.shape(adj_matrix))
+#     adj_matrix.fill_diagonal_(0)  # 去掉自身连接
+#     transition_matrix = adj_matrix / adj_matrix.sum(dim=1, keepdim=True)  # 归一化为转移概率
+#     return transition_matrix
+def visit(data, alpha=0.5):
+    """
+    根据给定的节点特征矩阵data和参数alpha计算转移矩阵。
+    使用余弦相似度矩阵作为转移的相似度度量。
+    计算公式：r_mu(x_i) = (x_i / mu_i) ^ -alpha
+    """
+    num_nodes = data.size(0)
+    # 计算节点间的余弦相似度矩阵
+    adj_matrix = cosineSimilarty(data, data)
+    # 归一化每一行，确保每行相似度和为1
+    adj_matrix.fill_diagonal_(0)  # 去掉自身连接
+    adj_matrix = torch.nan_to_num(adj_matrix, nan=0.0001)  # 防止NaN值
+    # 归一化为转移概率，确保每行的和为1
+    row_sums = adj_matrix.sum(dim=1, keepdim=True) + 0.000001  # 防止除以零
+    adj_matrix = adj_matrix / row_sums  # 归一化为转移概率
+    # 防止出现概率为零的行（所有相似度为零时）
+    adj_matrix = torch.nan_to_num(adj_matrix, nan=0.0001)  # 替换NaN为小值
+    adj_matrix = torch.clamp(adj_matrix, min=0.0001)  # 防止小于0的概率值
+    # 根据 alpha 修改相似度矩阵
+    transition_matrix = adj_matrix ** (-alpha)  # 应用公式 r_mu(x_i) = (x_i / mu_i) ^ -alpha
+    # 再次归一化转移矩阵，使得每行的和为1
+    transition_matrix = transition_matrix / (transition_matrix.sum(dim=1, keepdim=True) + 0.000001)
+    # 检查是否有行的和仍然为0，若有则设置为均匀分布
+    zero_rows = (transition_matrix.sum(dim=1) == 0)
+    if zero_rows.any():
+        transition_matrix[zero_rows] = 1.0 / num_nodes  # 对于零行，设置均匀分布
+    return transition_matrix
+# Optimized method
+def random_walk_batch_paths(transition_matrix, num_walks, walk_length):
+    """
+    批量化生成随机游走路径，并统计访问频次。
+    """
+    num_nodes = transition_matrix.size(0)
+    visit_matrix = torch.zeros_like(transition_matrix,device='cuda')  # 初始化访问频率矩阵
+    for start_node in range(num_nodes):  # 遍历每个起始节点
+        # 初始化起点
+        paths = torch.full((num_walks, walk_length + 1), start_node, dtype=torch.long,device='cuda')  # 每行一条路径
+        for step in range(walk_length):  # 生成完整路径
+            probs = transition_matrix[paths[:, step]]  # 当前步节点的转移概率
+            next_nodes = torch.multinomial(probs, 1).squeeze()  # 采样下一个节点
+            paths[:, step + 1] = next_nodes
+        # 累计所有路径的访问频率
+        for path in paths:
+            visit_matrix[start_node].index_add_(0, path, torch.ones_like(path, dtype=torch.float,device='cuda'))
+    visit_matrix -= torch.diag(torch.full((num_nodes,), num_walks, dtype=visit_matrix.dtype,device='cuda'))
+    return visit_matrix
+# visit_matrix = random_walk_batch_paths(transition_matrix, num_walks, walk_length)
+#
+# # visit_matrix = random_walk_parallel(transition_matrix, num_walks, walk_length)
+#
+# Step 3: normalize visit frequencies into similarities and build a thresholded kNN graph
+def thresholded_knn(visit_matrix,similarity_threshold):
+    similarity_matrix = visit_matrix / visit_matrix.max()
+    thresholded_adj = (similarity_matrix > similarity_threshold).float()  # 保留相似度大于阈值的边
+    return thresholded_adj
+# # Step 5: select anchors with the greedy cover algorithm
+def greedy_cover_with_importance(data, importance_scores, r, num_anchors):
+    """
+    贪心覆盖算法用于选择锚点
+    :param data: 数据点，形状为 (n_samples, n_features)
+    :param importance_scores: 每个点的重要性分数 (随机游走访问频率)
+    :param r: 覆盖半径
+    :param num_anchors: 需要选择的锚点数量
+    :return: 锚点索引
+    """
+    distances = cosineSimilartydis(data,data)  # 计算点对点距离
+    selected = []  # 选择的锚点索引
+    covered = torch.zeros(data.size(0), dtype=torch.bool,device='cuda')  # 覆盖标志位
+    sorted_indices = torch.argsort(importance_scores, descending=True)  # 按重要性排序
+    cluster_selected = torch.zeros(data.size(0), dtype=torch.bool, device='cuda')  # 集群是否被选中锚点标记
+    while len(selected) < num_anchors:
+        # prev_covered_sum = covered.sum().item()  # 上一次覆盖点的数量
+        for idx in sorted_indices:
+            if len(selected) >= num_anchors:
+                break
+            if not covered[idx] and not cluster_selected[idx]:  # 如果当前点未被覆盖，且所属集群未选过锚点
+                selected.append(idx)  # 选择锚点
+                cluster_selected[idx] = 1  # 标记所属集群已选锚点
+                # 将当前锚点覆盖范围内的点标记为已覆盖
+                covered |= distances[idx] <= r
+                covered[idx] = 1#调了半天，锚点自己没有被覆盖
+        selected_anchors = set(selected)  # 当前已选择的锚点集合
+        selected_anchors_tensor = torch.tensor(list(selected_anchors), device='cuda')
+        # 检查是否所有集群都已被选过锚点
+        if covered.sum().item() == data.size(0):
+            print("所有点已被覆盖，重置覆盖状态")
+            # 记录已选的锚点，重置覆盖标志
+            covered[:] = 0
+            covered[selected_anchors_tensor] = 1  # 恢复已选锚点的覆盖状态
+            print(len(selected))
+        # elif covered.sum().item() == prev_covered_sum:
+        #     print("没有新的点被覆盖，终止选择锚点")
+        #     break  # 如果没有新点被覆盖，跳出循环
+    return torch.tensor(selected,device='cuda')
+# 计算节点的重要性（访问频率的总和）
+# node_importance = visit_matrix.sum(dim=1)
+#
+# # 使用贪心覆盖算法选择锚点
+# anchor_indices = greedy_cover_with_importance(data, node_importance, coverage_radius, num_anchors)
+# anchors = data[anchor_indices]  # 提取锚点
+# # Step 6: 可视化结果
+# # from sklearn.decomposition import PCA
+# # import matplotlib.pyplot as plt
+# #
+# # # 假设 data 和 anchors 是 5维张量
+# # pca = PCA(n_components=2, random_state=42)
+# #
+# # # 降维到 2D
+# # data_2d = pca.fit_transform(data.detach().cpu().numpy())
+# # anchors_2d = pca.transform(anchors.detach().cpu().numpy())
+# #
+# # # 绘制统一显示的散点图
+# # plt.figure(figsize=(8, 8))
+# # plt.scatter(data_2d[:, 0], data_2d[:, 1], c='blue', label="Data Points", alpha=0.5, s=30)
+# # plt.scatter(anchors_2d[:, 0], anchors_2d[:, 1], color="red", label="Anchor Points", s=100, edgecolor='black')
+# # plt.title("Unified Visualization with PCA")
+# # plt.legend()
+# # plt.show()
+#
+#
+#
+# # 使用 UMAP 降维
+# reducer = umap.UMAP(n_components=2)
+# data_2d = reducer.fit_transform(data.detach().cpu().numpy())
+# anchors_2d = reducer.transform(anchors.detach().cpu().numpy())
+#
+# # 绘制统一显示图
+# plt.figure(figsize=(8, 8))
+# plt.scatter(data_2d[:, 0], data_2d[:, 1], c='blue', label="Data Points", alpha=0.5, s=30)
+# plt.scatter(anchors_2d[:, 0], anchors_2d[:, 1], color="red", label="Anchor Points", s=20, edgecolor='black')
+# plt.title("Unified Visualization with UMAP")
+# plt.legend()
+# plt.show()

anchors.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from SRW_KNN_greedy import *
+import torch
+def get_anchors(h0,h1,map_pairs,num_unique_labels):
+    #又一个点，提前预训练，统一样本的潜在空间
+    # print(h0.shape[0],num_unique_labels,'ghjhggjf')
+    #初始化随机游走参数
+    num_walks0, walk_length0, similarity_threshold0, num_anchors0, coverage_radius0 = para(h0,h0.shape[0],num_unique_labels)
+    num_walks1, walk_length1, similarity_threshold1, num_anchors1, coverage_radius1 = para(h1, h1.shape[0],num_unique_labels)
+    transition_matrix0,transition_matrix1 = visit(h0),visit(h1)#转移概率矩阵
+    #访问矩阵
+    visit_matrix0,visit_matrix1 = random_walk_batch_paths(transition_matrix0, num_walks0, walk_length0), random_walk_batch_paths(transition_matrix1, num_walks1, walk_length1)
+    #
+    node_importance0, node_importance1 = visit_matrix0.sum(dim=0),visit_matrix1.sum(dim=0)
+    # # 使用贪心覆盖算法选择锚点
+    anchor_indices0 = greedy_cover_with_importance(h0, node_importance0, coverage_radius0, num_anchors0)
+    anchor_indices1 = greedy_cover_with_importance(h1, node_importance1, coverage_radius1, num_anchors1)
+    combined_indices = torch.cat((anchor_indices0, anchor_indices1))
+    unique_indices = torch.unique(combined_indices)#合并索引去重
+    len_indices=len(unique_indices)
+    mapdata0,mapdata1=torch.tensor(map_pairs[0]),torch.tensor(map_pairs[1])
+    anchors0,anchors1 = mapdata0[unique_indices].float(),mapdata1[unique_indices].float()# 提取锚点(降维前）
+    return anchors0,anchors1,len_indices

config.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from easydict import EasyDict
+# Shared configuration object; settings are stored as attributes on it.
+config = EasyDict()
+# NOTE(review): `dim` is not referenced in the visible part of this file -- confirm it is needed.
+from run import dim
+'''3Sources'''
+# Input dimensionality of view 1 and view 2.
+config.input_features1 =3560
+config.input_features2 =3631
+# Output widths of the successive encoder layers (shared by both views).
+config.enhidden_features = [2000, 320, 50,6]
+# Output widths of the successive decoder layers for view 1 / view 2;
+# the last entry equals the corresponding input dimensionality.
+config.dehidden_features1 = [50, 320, 2000,3560]
+config.dehidden_features2 = [50, 320, 2000,3631]
+# presumably the number of classes in the dataset -- TODO confirm
+config.classes = 6
+'''BBCsports'''
+# config.input_features1 =2582
+# config.input_features2 =2544
+# config.enhidden_features = [1500, 200, 50,5]
+# config.dehidden_features1 = [50, 200, 1500,2582]
+# config.dehidden_features2 = [50, 200, 1500,2544]
+# config.classes = 5
+'''Caltech101'''
+# config.input_features1 =1984
+# config.input_features2 =512
+# config.enhidden_features = [500, 320, 50,10]
+# config.dehidden_features1 = [50, 320, 500,1984]
+# config.dehidden_features2 = [50, 320, 500,512]
+# config.classes = 20
+'''ORL_mtv'''
+# config.input_features1 =400
+# config.input_features2 =400
+# config.enhidden_features = [300, 150, 50,10]
+# config.dehidden_features1 = [50, 150, 300,400]
+# config.dehidden_features2 = [50, 150, 300,400]
+# config.classes = 40
+'''Caltech101_7'''
+# config.input_features1 =1984
+# config.input_features2 =512
+# config.enhidden_features = [500, 320, 50,5]
+# config.dehidden_features1 = [50, 320, 500,1984]
+# config.dehidden_features2 = [50, 320, 500,512]
+# config.classes = 7
+'''scene15'''
+# config.input_features1 =20
+# config.input_features2 =59
+# config.enhidden_features = [20, 15, 15,10]
+# config.dehidden_features1 = [15, 15, 20,20]
+# config.dehidden_features2 = [15, 15, 20,59]
+# config.classes = 10
+'''Prokaryotic'''
+# config.input_features1 =393
+# config.input_features2 =438
+# config.enhidden_features = [300, 150, 50,10]
+# config.dehidden_features1 = [50, 150, 300,393]
+# config.dehidden_features2 = [50, 150, 300,438]
+# config.classes = 4
+'''yale_mtv'''
+# config.input_features1 =4096
+# config.input_features2 =3304
+# config.enhidden_features = [1500, 200, 50,5]
+# config.dehidden_features1 = [50, 200, 1500,4096]
+# config.dehidden_features2 = [50, 200, 1500,3304]
+# config.classes = 15
+'''flower17'''
+# config.input_features1 =1360
+# config.input_features2 =1360
+# config.enhidden_features = [1000, 200, 50,5]
+# config.dehidden_features1 = [50, 200, 1000,1360]
+# config.dehidden_features2 = [50, 200, 1000,1360]
+# config.classes = 17
+'''100leaves'''
+# config.input_features1 =64
+# config.input_features2 =64
+# config.enhidden_features = [200, 200, 50,10]
+# config.dehidden_features1 = [50, 200, 200,64]
+# config.dehidden_features2 = [50, 200, 200,64]
+# config.classes = 100
+# Optimizer / training-loop settings.
+config.lr = 1e-3
+config.momentum = 0.9# SGD-only parameter: momentum uses a weighted average of past gradients to adjust the current gradient direction and avoid oscillation
+config.weight_decay = 0
+config.w_v = 0
+config.print_step = 10
+config.tensorboard_step = 100
+config.load_iter = 0
+config.train_iters = 5000
+# Flags read elsewhere in the project (e.g. `is_train` enables the optimizer setup in the model).
+config.is_train = True
+config.use_cuda = True

data_loader.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import mat73
+import numpy as np
+import scipy.io as sio
+import torch
+import random
+from torch.utils.data import Dataset, DataLoader
+from utils import *
+def get_pairs(E_X, E_Y, neg_prop, train_label):
+    view0, view1, labels, real_labels, class_labels0, class_labels1 = [], [], [], [], [], []
+    # construct pos. pairs
+    for i in range(len(E_X)):
+        view0.append(E_X[i])
+        view1.append(E_Y[i])
+        labels.append(1)
+        real_labels.append(1)
+        class_labels0.append(train_label[i])
+        class_labels1.append(train_label[i])
+    # construct neg. pairs by taking each sample in view0 as an anchor and randomly sample neg_prop samples from view1,
+    # which may lead to the so called noisy labels, namely, some of the constructed neg. pairs may in the same category.
+    for j in range(len(E_X)):
+        neg_idx = random.sample(range(len(E_Y)), neg_prop)
+        for k in range(neg_prop):
+            view0.append(E_X[j])
+            view1.append(E_Y[neg_idx[k]])
+            labels.append(0)
+            class_labels0.append(train_label[j])
+            class_labels1.append(train_label[neg_idx[k]])
+            if train_label[j] != train_label[neg_idx[k]]:
+                real_labels.append(0)
+            else:
+                real_labels.append(1)
+    labels = np.array(labels, dtype=np.int64)
+    real_labels = np.array(real_labels, dtype=np.int64)
+    class_labels0, class_labels1 = np.array(class_labels0, dtype=np.int64), np.array(class_labels1, dtype=np.int64)
+    view0, view1 = np.array(view0, dtype=np.float32), np.array(view1, dtype=np.float32)
+    return view0, view1, labels, real_labels, class_labels0, class_labels1

model.py ADDED Viewed

	@@ -0,0 +1,181 @@

+import torch.nn as nn
+from torch import optim
+class endA(nn.Module):
+    def __init__(self, in_features, out_features):
+        super(endA, self).__init__()
+        self.encoder = nn.Sequential(
+            nn.Linear(in_features, out_features),
+            nn.BatchNorm1d(out_features),
+            nn.ReLU(True),
+            nn.Dropout(0.1)
+        )  # 编码
+        #self.decoder[0].weight.data = self.encoder[0].weight.data.transpose(0, 1)
+    def forward(self, x):
+        h = self.encoder(x)
+        return h
+class dedA(nn.Module):
+    def __init__(self, out_features, in_features):
+        super(dedA, self).__init__()
+        self.decoder = nn.Sequential(
+            nn.Linear(out_features, in_features),
+            nn.ReLU(True)
+        )  # 编码
+        # self.decoder[0].weight.data = self.encoder[0].weight.data.transpose(0, 1)
+    def forward(self, x):
+        h = self.decoder(x)
+        return h
+class SdA(nn.Module):
+    """Two-view stacked autoencoder: one encoder/decoder stack per view.
+
+    ``layers1``/``layers2`` encode and decode view 1, ``layers3``/``layers4``
+    encode and decode view 2. Both encoders use ``config.enhidden_features``;
+    the decoders use ``config.dehidden_features1`` / ``config.dehidden_features2``.
+    """
+    def __init__(self, config):
+        super(SdA, self).__init__()
+        layers1 = []
+        layers2 = []
+        layersall1=[]
+        layersall2 = []
+        # View-1 encoder: chain of endA layers starting at input_features1.
+        in_features1 = config.input_features1
+        for out_features in config.enhidden_features:
+            layer1 = endA(in_features1, out_features)
+            in_features1 = out_features
+            layers1.append(layer1)
+        self.layers1 = nn.Sequential(*layers1)  # wrapped into a single module
+        # View-1 decoder: chain of dedA layers starting at the latent width.
+        in_features=config.enhidden_features[-1]
+        for out_features in config.dehidden_features1:
+            layer2 = dedA(in_features, out_features)
+            in_features = out_features
+            layers2.append(layer2)
+        self.layers2=nn.Sequential(*layers2)
+        # Encoder followed by decoder as one module; forward() below does not use it.
+        layersall1.append(self.layers1)
+        layersall1.append(self.layers2)
+        self.layerll1=nn.Sequential(*layersall1)
+        if config.is_train:
+            self.ce_criterion = nn.CrossEntropyLoss()
+            self.da_optimizers = []
+            # One Adam optimizer per view-1 encoder layer except the last one.
+            # The learning rate is hard-coded here; config.lr is not used.
+            for layer1 in self.layers1[:-1]:
+                # optimizer = optim.SGD(layer1.parameters(), lr=config.lr,
+                #                       momentum=config.momentum, weight_decay=config.weight_decay)  # the optimizer could be changed
+                optimizer = optim.Adam(
+                    layer1.parameters(), lr=0.001, betas=(0.9, 0.99), eps=1e-8, weight_decay=0)
+                self.da_optimizers.append(optimizer)
+        layers3 = []
+        layers4 = []
+        # View-2 encoder: same hidden widths, starting at input_features2.
+        in_features2 = config.input_features2
+        for out_features in config.enhidden_features:
+            layer3 = endA(in_features2, out_features)
+            in_features2 = out_features
+            layers3.append(layer3)
+        self.layers3 = nn.Sequential(*layers3)  # wrapped into a single module
+        # View-2 decoder.
+        in_features=config.enhidden_features[-1]
+        for out_features in config.dehidden_features2:
+            layer4 = dedA(in_features, out_features)
+            in_features = out_features
+            layers4.append(layer4)
+        self.layers4=nn.Sequential(*layers4)
+        layersall2.append(self.layers3)
+        layersall2.append(self.layers4)
+        self.layerll2 = nn.Sequential(*layersall2)
+        # for layer in self.layers3:
+        #     print(layer)
+        if config.is_train:
+            self.ce_criterion = nn.CrossEntropyLoss()
+            # NOTE(review): this re-creates da_optimizers, so the view-1 optimizers
+            # appended above are discarded and only the view-2 ones remain -- confirm intent.
+            self.da_optimizers = []
+            for layer1 in self.layers3[:-1]:
+                # optimizer = optim.SGD(layer1.parameters(), lr=config.lr,
+                #                       momentum=config.momentum, weight_decay=config.weight_decay)  # the optimizer could be changed
+                optimizer=optim.Adam(layer1.parameters(),lr=0.001,betas=(0.9,0.99),eps=1e-8,weight_decay=0)
+                self.da_optimizers.append(optimizer)
+            # one optimizer per layer
+    def forward(self, x1, x2):
+        """Encode and decode both views.
+
+        Returns ``(h1, h2, h3, h4)``: the encoder outputs for view 1 and view 2,
+        followed by the decoder outputs for view 1 and view 2.
+        """
+        h1, h2 = x1, x2
+        for layer1 in self.layers1:
+            h1 = layer1(h1)
+        h3 = h1
+        for layer2 in self.layers2:
+            h3 = layer2(h3)
+        for layer3 in self.layers3:
+            h2 = layer3(h2)
+        h4=h2
+        for layer4 in self.layers4:
+            h4 = layer4(h4)
+        return h1, h2, h3, h4  # (original author's note: "don't quite understand the structure")
+    def regularization_loss(self):
+        """Return 0.001 times the sum of squares of all parameters (L2 penalty)."""
+        l2_lambda = 0.001
+        l2_norm = sum(p.pow(2).sum() for p in self.parameters())
+        return l2_lambda * l2_norm
+class Anchormodel(nn.Module):
+    def __init__(self,dim,outfeature):
+        super(Anchormodel, self).__init__()
+        self.encoder0 = nn.Sequential(
+            nn.Linear(dim, 1024),
+            nn.BatchNorm1d(1024),
+            nn.ReLU(True),
+            nn.Dropout(0.2),
+            nn.Linear(1024, 1024),
+            nn.BatchNorm1d(1024),
+            nn.ReLU(True),
+            nn.Dropout(0.2),
+            nn.Linear(1024, 1024),
+            nn.BatchNorm1d(1024),
+            nn.ReLU(True),
+            nn.Dropout(0.2),
+            nn.Linear(1024, outfeature),
+            nn.BatchNorm1d(outfeature),
+            nn.ReLU(True)
+        )
+        self.encoder1 = nn.Sequential(
+            nn.Linear(dim, 1024),
+            nn.BatchNorm1d(1024),
+            nn.ReLU(True),
+            nn.Dropout(0.2),
+            nn.Linear(1024, 1024),
+            nn.BatchNorm1d(1024),
+            nn.ReLU(True),
+            nn.Dropout(0.2),
+            nn.Linear(1024, 1024),
+            nn.BatchNorm1d(1024),
+            nn.ReLU(True),
+            nn.Dropout(0.2),
+            nn.Linear(1024, outfeature),
+            nn.BatchNorm1d(outfeature),
+            nn.ReLU(True)
+        )
+        # self.decoder0 = nn.Sequential(nn.Linear(outfeature, 1024), nn.ReLU(), nn.Dropout(0.2), nn.Linear(1024, 1024), nn.ReLU(),
+        #                               nn.Dropout(0.2), nn.Linear(1024, 1024), nn.ReLU(), nn.Dropout(0.2),
+        #                               nn.Linear(1024, dim))
+        # self.decoder1 = nn.Sequential(nn.Linear(outfeature, 1024), nn.ReLU(), nn.Dropout(0.2), nn.Linear(1024, 1024), nn.ReLU(),
+        #                               nn.Dropout(0.2), nn.Linear(1024, 1024), nn.ReLU(), nn.Dropout(0.2),
+        #                               nn.Linear(1024, dim))
+    def forward(self, x0, x1):
+        h0 = self.encoder0(x0.view(x0.size()[0], -1))
+        h1 = self.encoder1(x1.view(x1.size()[0], -1))
+        # z0 = self.decoder0(h0)
+        # z1 = self.decoder1(h1)
+        return h0, h1

run.py ADDED Viewed

	@@ -0,0 +1,310 @@

+import argparse
+import time
+import random
+from model import *
+import math
+import torch,gc
+import torch.nn as nn
+import torch.nn.functional as F
+from train_methods import *
+import logging
+import sys
+import numpy as np
+import matplotlib.pyplot as plt
+from Datasets import *
+from config import *
+from data_loader import *
+import mat73
+from anchors import *
+from Cluster import *
+parser = argparse.ArgumentParser(description='CAPIMAC in PyTorch')
+parser.add_argument('--data', default='1', type=int,
+                    help='choice of dataset, 0-HW,1-3Sources,2BBC,3-Scene15, 4-Caltech101,5-ORL_mtv,6-Caltech_7,7-Reuters,'
+                         '8-20newsgroups,9-100leaves,10-BBC4,11-MSRCv1,12-BDGP,13-HandWritten,14-yale_mtv，15-Wikipedia-test,16-Movies,17-Prokaryotic,18-ALOI,19-flower17')
+parser.add_argument('-bs', '--batch-size', default='1024', type=int, help='number of batch size')
+parser.add_argument('-e', '--epochs', default='200', type=int, help='number of epochs to run')
+parser.add_argument('-lr', '--learn-rate', default='0.0001', type=float, help='learning rate of adam')
+parser.add_argument('-ap', '--aligned-prop', default='0.5', type=float,
+                    help='originally aligned proportions in the partially view-aligned data')
+parser.add_argument('--gpu', default=0, type=int, help='GPU device idx to use.')
+parser.add_argument('-cp', '--complete-prop', default='0.5', type=float,
+                    help='originally complete proportions in the partially sample-missing data')
+parser.add_argument('-m', '--margin', default='5', type=int, help='initial margin')
+parser.add_argument('-s', '--start-fine', default=True, type=bool, help='flag to start use robust loss or not')
+parser.add_argument('-np', '--neg-num', default='30', type=int, help='the ratio of negative to positive pairs')
+parser.add_argument('-noise', '--noisy-training', type=bool, default=True,
+                    help='training with real labels or noisy labels')
+parser.add_argument('-r', '--robust', default=1, type=int, help='use our robust loss or not')
+dim=0
+class NoiseRobustLoss(nn.Module):
+    def __init__(self):
+        super(NoiseRobustLoss, self).__init__()
+    def forward(self, pair_dist, P, margin, use_robust_loss, args):
+        # print(max(pair_dist))
+        dist_sq = pair_dist * pair_dist
+        P = P.to(torch.float32)
+        N = len(P)
+        if use_robust_loss == 1:
+            if args.start_fine:
+                loss = P * dist_sq + (1 - P) * (1 / margin) * torch.pow(
+                    torch.clamp(torch.pow(pair_dist, 0.5) * (0.5*margin - pair_dist), min=0.0), 2)
+            else:
+                loss = P * dist_sq + (1 - P) * torch.pow(torch.clamp(margin - pair_dist, min=0.0), 2)
+        else:
+            loss = P * dist_sq + (1 - P) * torch.pow(torch.clamp(margin - pair_dist, min=0.0), 2)
+        loss = torch.sum(loss) / (2.0 * N)
+        return loss
+def load_data(align_prop,complete_prop,neg_num,is_noise,dataset):
+    global dim
+    NetSeed = random.randint(1, 1000)
+    # NetSeed=72
+    print(NetSeed)
+    np.random.seed(NetSeed)
+    torch.backends.cudnn.deterministic = True
+    torch.manual_seed(NetSeed)  # 为CPU设置随机种子
+    torch.cuda.manual_seed(NetSeed)  # 为当前GPU设置随机种子
+    args = parser.parse_args()
+    all_data = []
+    map_pairs = []
+    label = []
+    train_pairs = []
+    if dataset=='Caltech101_7':
+        path = './datasets/' + dataset + '.mat'  # 路径
+        mat = mat73.loadmat(path)  # 加载mat文件
+    else:
+        mat = sio.loadmat('./datasets/' + dataset + '.mat')
+    if dataset == 'Scene15':
+        data = mat['X'][0][0:2]  # 20, 59 dimensions
+        label = np.squeeze(mat['Y'])
+    elif dataset == 'HandWritten':
+        data = mat['X'][0][1:3]
+        label = np.squeeze(mat['Y'])
+    elif dataset == '3Sources':
+        data = mat['X'][0][0:2]
+        label = np.squeeze(mat['Y'])
+    elif dataset == 'ALOI':
+        data = mat['X'][0][0:2]
+        label = np.squeeze(mat['gt'])
+    elif dataset == 'BBCsports':
+        data = mat['X'][0][0:2]
+        label = np.squeeze(mat['Y'])
+    elif dataset == 'Caltech101':
+        data = mat['X'][0][0:2]
+        label = np.squeeze(mat['Y'])
+    elif dataset == 'Reuters_dim10':
+        data = []  # 18758 samples
+        data.append(normalize(np.vstack((mat['x_train'][0], mat['x_test'][0]))))
+        data.append(normalize(np.vstack((mat['x_train'][1], mat['x_test'][1]))))
+        label = np.squeeze(np.hstack((mat['y_train'], mat['y_test'])))
+    elif dataset == 'ORL_mtv':
+        data = mat['X'][0][0:2]
+        label = np.squeeze(mat['gt'])
+    elif dataset == 'Caltech101_7':
+        data = mat['data'][3:5]
+        data[0], data[1] = np.squeeze(data[0]), np.squeeze(data[1])
+        data[0], data[1] = np.array(data[0]), np.array(data[1])
+        label = np.squeeze(mat['labels'])
+    elif dataset == 'Reuters':
+        data = mat['X'][0][0:2]
+        label = np.squeeze(mat['Y'])
+    elif dataset == '20NewsGroups':
+        data = mat['data'][0][1:3]
+        label = np.squeeze(mat['truelabel'][0][0])
+    elif dataset == '100leaves':
+        mat['data'][0][0], mat['data'][0][1] = mat['data'][0][0].T, mat['data'][0][1].T
+        data = mat['data'][0][0:2]
+        label = np.squeeze(mat['truelabel'][0][0])
+    elif dataset == 'BBC4':
+        data = mat['data'][0][0:2]
+        label = np.squeeze(mat['truelabel'][0][0])
+        # print(label)
+    elif dataset == 'MSRCv1':
+        data = mat['X'][0][1:3]
+        label = np.squeeze(mat['Y'])
+    elif dataset == 'BDGP':
+        mat['X'][0][0], mat['X'][0][1] = mat['X'][0][0].T, mat['X'][0][1].T
+        data = mat['X'][0][0:2]
+        label = np.squeeze(mat['gt'])
+    elif dataset == 'HandWritten':
+        data = mat['X'][0][1:3]
+        label = np.squeeze(mat['Y'])
+    elif dataset == 'yale_mtv':
+        mat['X'][0][0], mat['X'][0][1] = mat['X'][0][0].T, mat['X'][0][1].T
+        data = mat['X'][0][0:2]
+        # print((data))
+        label = np.squeeze(mat['gt'])
+    elif dataset == 'Wikipedia-test':
+        data = mat['X'][0:2][0:2]
+        data = np.squeeze(data.T)
+        # print(data)
+        label = np.squeeze(mat['y'])
+    elif dataset == 'Movies':
+        data = mat['X'][0:2][0:2]
+        data = np.squeeze(data.T)
+        # print(data)
+        label = np.squeeze(mat['y'])
+    elif dataset == 'Prokaryotic':
+        value1 = mat['X'][0][0]
+        value2 = mat['X'][2][0]
+        data = [value1, value2]
+        # print(data)
+        label = np.squeeze(mat['y'])
+    elif dataset == 'flower17':
+        data = mat['X'][0][0:2]
+        label = np.squeeze(mat['Y'])
+    divide_seed = random.randint(1, 1000)
+    train_idx, test_idx = TT_split(len(label), 1 - align_prop, divide_seed)
+    train_label, test_label = label[train_idx], label[test_idx]
+    if dataset == 'Caltech101_7':
+        data[0], data[1] = np.squeeze(data[0]), np.squeeze(data[1])
+    print(np.shape(data[0]))
+    train_X, train_Y, test_X, test_Y = data[0][train_idx], data[1][train_idx], data[0][test_idx], data[1][test_idx]
+    '''获取对齐部分的潜在表示'''
+    map_pairs.append(train_X)
+    map_pairs.append(train_Y)
+    h0 , h1,epoch_time=pretrain(map_pairs, args)
+    all_label = np.concatenate((train_label, test_label))
+    '''获取初始训练数据和测试数据'''
+    if align_prop != 1:
+        shuffle_idx = random.sample(range(len(test_Y)), len(test_Y))
+        test_Y = test_Y[shuffle_idx]
+        test_label_X, test_label_Y = test_label, test_label[shuffle_idx]
+    elif align_prop == 1:
+        all_data.append(train_X.T)
+        all_data.append(train_Y.T)
+    '''不完整部分'''
+    test_mask = get_sn(2, len(test_label), 1 - complete_prop)
+    X_mask, Y_mask = test_mask[:, 0].astype(np.bool_), test_mask[:, 1].astype(np.bool_)
+    # test_X[~X_mask] = 0
+    # test_Y[~Y_mask] = 0
+    test_X, test_Y = test_X[X_mask], test_Y[Y_mask]
+    test_label_X, test_label_Y=test_label_X[X_mask], test_label_Y[Y_mask]
+    if align_prop != 1:
+        all_label_X = np.concatenate((train_label, test_label_X))
+        all_label_Y = np.concatenate((train_label, test_label_Y))
+        all_data.append(np.concatenate((train_X, test_X)).T)
+        all_data.append(np.concatenate((train_Y, test_Y)).T)
+        all_label = np.concatenate((train_label, test_label))
+        # all_label_X = test_label_X
+        # all_label_Y = test_label_Y
+        # all_data.append(test_X.T)
+        # all_data.append(test_Y.T)
+        # all_label = test_label
+    elif align_prop == 1:
+        all_label_X, all_label_Y = train_label, train_label
+        all_label = train_label
+    '''构建训练对'''
+    view0, view1, noisy_labels, real_labels, _, _ = get_pairs(train_X, train_Y, neg_num, train_label)
+    count = 0
+    for i in range(len(noisy_labels)):
+        if noisy_labels[i] != real_labels[i]:
+            count += 1
+    print('noise rate of the constructed neg. pairs is ', round(count / (len(noisy_labels) - len(train_X)), 2))
+    if is_noise == 0:  # training with real_labels, v/t with real_labels
+        print("----------------------Training with real_labels----------------------")
+        train_pair_labels = real_labels
+    else:  # training with labels, v/t with real_labels
+        print("----------------------Training with noisy_labels----------------------")
+        train_pair_labels = noisy_labels
+    '''初始化锚点'''
+    num_unique_labels = np.unique(all_label).shape[0]
+    anchors0,anchors1,len_indices=get_anchors(h0,h1,map_pairs,num_unique_labels)#h0是tensor
+    '''数据重表示'''
+    view0,view1,all_data[0],all_data[1]=torch.from_numpy(view0).float(),torch.from_numpy(view1).float(),torch.from_numpy(all_data[0]).float(),torch.from_numpy(all_data[1]).float()
+    view0, view1, all_data[0],all_data[1]=find_nanchor(anchors0,view0),find_nanchor(anchors1,view1),find_nanchor(anchors0,all_data[0].T),find_nanchor(anchors1,all_data[1].T)
+    #锚点数×样本数,增强锚点图
+    view0, view1, all_data[0], all_data[1]=np.array(view0),np.array(view1),np.array(all_data[0]),np.array(all_data[1])
+    print(np.shape(view0),'view0')
+    train_pairs.append(view0)
+    train_pairs.append(view1)
+    train_pair_real_labels = real_labels
+    dim=view0.shape[0]
+    return train_pairs, train_pair_labels, train_pair_real_labels, all_data, all_label, all_label_X, all_label_Y, dim,num_unique_labels,divide_seed
+def normalize(x):
+    x = (x - np.tile(np.min(x, axis=0), (x.shape[0], 1))) / np.tile((np.max(x, axis=0) - np.min(x, axis=0)),
+                                                                    (x.shape[0], 1))
+    return x
+def loader(train_bs, align_prop, complete_prop,neg_num, is_noise, dataset):
+    """
+    :param train_bs: batch size for training, default is 1024
+    :param neg_prop: negative / positive pairs' ratio
+    :param test_prop: known aligned proportions for training MvCLN
+    :param is_noise: training with noisy labels or not, 0 --- not, 1 --- yes
+    :param data_idx: choice of dataset
+    :return: train_pair_loader including the constructed pos. and neg. pairs used for training MvCLN, all_loader including originally aligned and unaligned data used for testing MvCLN
+    """
+    train_pairs, train_pair_labels, train_pair_real_labels, all_data, all_label, all_label_X, all_label_Y, dim,num_unique_labels,divide_seed\
+        = load_data(align_prop,complete_prop,neg_num,is_noise, dataset)
+    train_pair_dataset = GetDataset(train_pairs, train_pair_labels, train_pair_real_labels)
+    train_pair_loader = DataLoader(
+        train_pair_dataset,
+        batch_size=train_bs,
+        shuffle=True,
+        drop_last=True
+    )
+    return train_pair_loader, all_data, all_label, all_label_X, all_label_Y, dim,num_unique_labels,divide_seed
+if __name__ == '__main__':
+    for i in range(1):
+        args = parser.parse_args()
+        data_name = ['HandWritten', '3Sources', 'BBCsports', 'Scene15', 'Caltech101', 'ORL_mtv', 'Caltech101_7', 'Reuters',
+                 '20NewsGroups','100leaves','BBC4','MSRCv1','BDGP','HandWritten','yale_mtv','Wikipedia-test','Movies','Prokaryotic','ALOI','flower17']
+        train_pair_loader, all_data, all_label, all_label_X, all_label_Y, dim, outfeature ,divide_seed=loader(args.batch_size, args.aligned_prop,args.complete_prop,args.neg_num,args.noisy_training,data_name[args.data])
+        model = Anchormodel(dim,outfeature).to(args.gpu)
+        criterion = NoiseRobustLoss().to(args.gpu)
+        # criterion_mse = nn.MSELoss().to(args.gpu)
+        optimizer = torch.optim.Adam(model.parameters(), lr=args.learn_rate)
+        CAR_list = []
+        acc_list, nmi_list, ari_list,f_list,f1_list,pre_list,pre2_list,rec_list,pur_list = [], [], [],[], [], [],[], [], []
+        train_time = 0
+        all_data[0], all_data[1]=torch.from_numpy(all_data[0]), torch.from_numpy(all_data[1])
+        for i in range(0, args.epochs + 1):
+            if i == 0:
+                with torch.no_grad():
+                    epoch_time = train2(train_pair_loader, model, criterion, optimizer, i, args)
+            else:
+                epoch_time = train2(train_pair_loader, model, criterion, optimizer, i, args)
+            # test
+            v0, v1, pred_label, alignment_rate = tiny_infer(model, args.gpu, all_data, all_label_X, all_label_Y)
+            CAR_list.append(alignment_rate)
+            data = []
+            data.append(v0)
+            data.append(v1)
+            y_pred, ret, accuracy, nmi, ari, f_score, f_score2, precision, precision2, recall, purity = Clustering(data,
+                                                                                                                   pred_label)
+            if i % 10 == 0:
+                print(accuracy, nmi, ari, f_score, f_score2, precision, precision2, recall, purity)
+                # logging.info("******** testing ********")
+                # logging.info(
+                #     "CAR={} kmeans: acc={} nmi={} ari={}".format(round(alignment_rate, 4), ret['kmeans']['accuracy'],
+                #                                                  ret['kmeans']['NMI'], ret['kmeans']['ARI']))
+            acc_list.append(ret['kmeans']['ACC'])
+            nmi_list.append(ret['kmeans']['NMI'])
+            ari_list.append(ret['kmeans']['ARI'])
+            f_list.append(ret['kmeans']['F1'])
+            f1_list.append(ret['kmeans']['F2'])
+            pre_list.append(ret['kmeans']['PRE'])
+            pre2_list.append(ret['kmeans']['PRE2'])
+            rec_list.append(ret['kmeans']['REC'])
+            pur_list.append(ret['kmeans']['PUR'])
+        print('ACC:', max(acc_list))
+        print("NMI:", max(nmi_list))
+        print("ARI:", max(ari_list))
+        print("F1:", max(f_list))
+        print("F2:", max(f1_list))
+        print("PRE:", max(pre_list))
+        print("PRE2:", max(pre2_list))
+        print("REC:", max(rec_list))
+        print("PUR:", max(pur_list))
+        logging.info('******** End, training time = {} s ********'.format(round(train_time, 2)))

sample_kernal.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+# 高斯核函数
+def gaussian_kernel(x, x_i, bandwidth):
+    return np.exp(-0.5 * ((x - x_i) / bandwidth) ** 2)
+# 核回归插值函数（支持多维）
+def kernel_regression_multi_dim(x_known, y_known, x_targets, bandwidth):
+    """
+    x_known: 已知点的 x 坐标 (1D array)
+    y_known: 已知点的 y 值，多维数组 (2D array, shape: [n_samples, n_features])
+    x_target: 需要插值的 x 坐标 (scalar)
+    bandwidth: 核函数的带宽参数
+    """
+    # 计算核权重
+    y_targets = []  # 存储每个目标点的插值结果
+    for x_target in x_targets:
+        # 计算核权重
+        weights = np.array([gaussian_kernel(x_target, x_i, bandwidth) for x_i in x_known])
+        weights /= weights.sum()  # 权重归一化
+        # 对每个维度分别插值
+        y_target = np.sum(weights[:, np.newaxis] * y_known, axis=0)
+        y_targets.append(y_target)
+    return np.array(y_targets)
+def insert_and_sort(x_known, y_known, x_targets, y_targets):
+    # 合并数据
+    # print(np.shape(y_known))
+    # print(np.shape(y_targets))
+    x_combined = np.concatenate((x_known, x_targets))
+    y_combined = np.vstack((y_known, y_targets))
+    # 按 x_combined 排序
+    sorted_indices = np.argsort(x_combined)
+    x_known_sorted = x_combined[sorted_indices]
+    y_known_sorted = y_combined[sorted_indices]
+    return x_known_sorted, y_known_sorted

train_methods.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import numpy as np
+from model import SdA
+from config import *
+import torch.nn as nn
+import torch
+import time
+import logging
+import torch.nn.functional as F
+def train1(train_pairs, model, criterion, optimizer, epoch, args):
+    if epoch % 10 == 0:
+        logging.info("=======> Train epoch: {}/{}".format(epoch, args.epochs))
+    model.train()
+    time0 = time.time()
+    loss_value = 0
+    x0,x1=torch.from_numpy(train_pairs[0]).float(),torch.from_numpy(train_pairs[1]).float()
+    x0, x1 = x0.to(args.gpu), x1.to(args.gpu)
+    # print(np.shape(x0))
+    try:
+        h0, h1, d0, d1 = model(x0, x1)
+    except:
+        print("error raise in batch",epoch)
+    #
+    # x0, x1 = torch.squeeze(x0), torch.squeeze(x1)
+    loss = criterion(x0, d0)
+    loss += criterion(x1, d1)
+    loss += model.regularization_loss()#l2正则化
+    loss_value += loss.item()
+    if epoch != 0:
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    epoch_time = time.time() - time0
+    return h0 , h1,epoch_time
+def pretrain(train_pairs, args):
+    model = SdA(config).to(args.gpu)
+    criterion = nn.MSELoss().to(args.gpu)
+    optimizer = torch.optim.Adam(model.parameters(), lr=args.learn_rate)
+    # 'train'
+    for i in range(0, args.epochs + 1):
+        if i == 0:
+            with torch.no_grad():
+                h0, h1, epoch_time = train1(train_pairs, model, criterion, optimizer, i, args)
+        else:
+            h0, h1, epoch_time = train1(train_pairs, model, criterion, optimizer, i, args)
+    return h0, h1, epoch_time
+def train2(train_loader, model, criterion,optimizer, epoch, args):
+    model.train()
+    time0 = time.time()
+    loss_value = 0
+    for batch_idx, (x0, x1, labels, real_labels) in enumerate(train_loader):
+        # labels refer to noisy labels for the constructed pairs, while real_labels are the clean labels for these pairs
+        x0, x1, labels, real_labels = x0.to(args.gpu), x1.to(args.gpu), labels.to(args.gpu), real_labels.to(args.gpu)
+        print(np.shape(x0))
+        try:
+            h0, h1 = model(x0.view(x0.size()[0], -1), x1.view(x1.size()[0], -1))
+        except:
+            print("error raise in batch", batch_idx)
+        pair_dist = F.pairwise_distance(h0, h1)
+        loss = criterion(pair_dist, labels, args.margin, args.robust, args)
+        # loss1=criterion_mse(z0, z1)
+        # print(loss1,'loss')
+        loss_value += loss.item()
+        if epoch != 0:
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+    epoch_time = time.time() - time0
+    return epoch_time

utils.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import numpy as np
+import random
+from sklearn.preprocessing import OneHotEncoder
+from numpy.random import randint
+import math
+import torch
+def TT_split(n_all, test_prop, seed):
+    '''
+    split data into training, testing dataset
+    '''
+    random.seed(seed)
+    random_idx = random.sample(range(n_all), n_all)
+    train_num = np.ceil((1-test_prop) * n_all).astype(int)
+    train_idx = random_idx[0:train_num]
+    test_num = np.floor(test_prop * n_all).astype(int)
+    test_idx = random_idx[-test_num:]
+    return train_idx, test_idx
+def get_sn(view_num, alldata_len, missing_rate):
+    """Randomly generate incomplete data information, simulate partial view data with complete view data
+
+    The given ``missing_rate`` is halved before use. The result marks with 1 the
+    (sample, view) entries that are kept. When the target keep rate is strictly
+    between ``1 / view_num`` and 1, random masks are redrawn until the observed
+    keep rate is within 0.005 of the target or 200 attempts have been made; the
+    last mask drawn is returned.
+    :param view_num:view number
+    :param alldata_len:number of samples
+    :param missing_rate:Defined in section 4.3 of the paper
+    :return:Sn, an (alldata_len, view_num) matrix of 0/1 entries -- TODO confirm the
+            one-hot branch always yields view_num columns (it depends on the random draw)
+    """
+    missing_rate = missing_rate / 2
+    # Target fraction of entries that stay present.
+    one_rate = 1.0 - missing_rate
+    if one_rate <= (1 / view_num):
+        # Keep rate at or below one view per sample: return a one-hot encoding
+        # of one randomly drawn view index per sample.
+        enc = OneHotEncoder()  # n_values=view_num
+        view_preserve = enc.fit_transform(randint(0, view_num, size=(alldata_len, 1))).toarray()
+        return view_preserve
+    error = 1
+    if one_rate == 1:
+        # Nothing is missing: randint(1, 2) can only produce 1, so this is an all-ones matrix.
+        matrix = randint(1, 2, size=(alldata_len, view_num))
+        return matrix
+    # Upper bound on the number of redraw attempts.
+    max_iterations = 200
+    # Attempt counter.
+    iterations = 0
+    while error >= 0.005 and iterations < max_iterations:
+        enc = OneHotEncoder()  # n_values=view_num
+        # One-hot encoding of a random view index per sample: exactly one 1 in every row.
+        view_preserve = enc.fit_transform(randint(0, view_num, size=(alldata_len, 1))).toarray()
+        # Number of extra ones needed on top of the one-per-sample ones.
+        one_num = view_num * alldata_len * one_rate - alldata_len
+        ratio = one_num / (view_num * alldata_len)
+        matrix_iter = (randint(0, 100, size=(alldata_len, view_num)) < int(ratio * 100)).astype(int)
+        # Count the random ones that landed on an entry view_preserve already sets.
+        a = np.sum(((matrix_iter + view_preserve) > 1).astype(int))
+        # Scale the requested count up to compensate for those overlaps.
+        # NOTE(review): divides by zero when one_num == 0 or a == one_num -- confirm callers avoid this.
+        one_num_iter = one_num / (1 - a / one_num)
+        ratio = one_num_iter / (view_num * alldata_len)
+        matrix_iter = (randint(0, 100, size=(alldata_len, view_num)) < int(ratio * 100)).astype(int)
+        # Final mask: an entry is kept if either source sets it.
+        matrix = ((matrix_iter + view_preserve) > 0).astype(int)
+        ratio = np.sum(matrix) / (view_num * alldata_len)
+        # Gap between the target keep rate and the one actually obtained.
+        error = abs(one_rate - ratio)
+        iterations=iterations+1
+    return matrix
+def cosineSimilartydis(A,B):
+    A=A/(torch.norm(A,dim=1,p=2,keepdim=True)+0.000001)
+    B=B/(torch.norm(B,dim=1,p=2,keepdim=True)+0.000001)
+    W=torch.mm(A,B.t())
+    max_values, _ = torch.max(W, axis=0)
+    min_values, _ = torch.min(W, axis=0)
+    denominator = max_values - min_values
+    denominator = torch.clamp(denominator, min=1e-6)
+    normalized_matrix = (W - min_values) / denominator
+    return 1-normalized_matrix
+def find_nanchor(A,B):
+    print(A.device)
+    W=cosineSimilartydis(A, B)#表示距离
+    n = math.ceil(W.shape[0]/19)
+    # print(n)
+    # 复制矩阵A以避免修改原始矩阵
+    modified_matrix_A = W.clone()
+    print(modified_matrix_A.device,'de')
+    for col in range(modified_matrix_A.shape[1]):
+        min_indices = np.argpartition(modified_matrix_A[:, col], n)[:n]
+        modified_matrix_A[min_indices, col] = 0
+    return modified_matrix_A