1. Data-management script. Each line of the raw input file has the format id\tclusterId\tgoldstandardId (document id, cluster id assigned by the algorithm, and gold-standard class id, separated by tabs). The script groups the document ids by cluster id and by gold-standard id and pickles the two resulting dictionaries.
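For reference, a raw input file in this format might look like the following (a minimal sketch with made-up document ids; the three tab-separated columns are document id, cluster id, and gold-standard class id):

    1	0	0
    2	0	0
    3	1	0
    4	1	1
    5	2	1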
DataManagement.py
#!/usr/bin/python
import cPickle as p
import sys
import re

if __name__ == "__main__":
    filename = str(sys.argv[1])
    preturn = re.compile(r'(^\s+|\s+$)')   # strip leading/trailing whitespace
    fidsrc = open(filename, 'r')
    clusters = {}        # clusterId -> list of document ids
    goldstandards = {}   # goldstandardId -> list of document ids
    for line in fidsrc.readlines():
        line = preturn.sub('', line)
        m = line.split('\t')
        if len(m) == 3:
            if not clusters.has_key(int(m[1])):
                clusters[int(m[1])] = []
                clusters[int(m[1])].append(int(m[0]))
            else:
                clusters[int(m[1])].append(int(m[0]))
            if not goldstandards.has_key(int(m[2])):
                goldstandards[int(m[2])] = []
                goldstandards[int(m[2])].append(int(m[0]))
            else:
                goldstandards[int(m[2])].append(int(m[0]))
    fidclusters = open(sys.argv[2], 'w')
    fidgoldstandards = open(sys.argv[3], 'w')
    p.dump(clusters, fidclusters)
    fidclusters.close()
    p.dump(goldstandards, fidgoldstandards)
    fidgoldstandards.close()
    fidsrc.close()
    print '%s has finished!' % sys.argv[0]
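A quick sanity check (a sketch, assuming DataManagement.py was run as "python DataManagement.py raw_data.txt clusters.pkl goldstandards.pkl", where all three file names are placeholders) is to load the two pickles back and inspect them:

    #!/usr/bin/python
    # Sketch: re-load the dictionaries written by DataManagement.py.
    # clusters.pkl / goldstandards.pkl are placeholder output names.
    import cPickle as p

    clusters = p.load(open('clusters.pkl', 'rb'))
    goldstandards = p.load(open('goldstandards.pkl', 'rb'))

    print '%d clusters, %d gold-standard classes' % (len(clusters), len(goldstandards))
    for clusterId in sorted(clusters.keys()):
        print 'cluster %d contains %d documents' % (clusterId, len(clusters[clusterId]))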
2. Evaluation script: computes purity, entropy, and normalized mutual information (NMI) from the two pickled dictionaries.
EvaluationClusterAlgorithm.py
#!/usr/bin/python
# -*- coding: cp936 -*-
import re
import cPickle as mypickle
import sys
import math


class Evaluation:
    # Cluster ids and gold-standard ids are assumed to be contiguous integers.
    def __init__(self, clusterfid, goldstandardfid):
        self.clusters = mypickle.load(open(clusterfid))            # clustering results
        self.goldstandards = mypickle.load(open(goldstandardfid))  # gold-standard answers
        tempclusterkeys = self.clusters.keys()
        tempclusterkeys.sort()
        tempgoldstandardkeys = self.goldstandards.keys()
        tempgoldstandardkeys.sort()
        self.k = len(tempclusterkeys)
        self.q = len(tempgoldstandardkeys)
        self.minclusterId = tempclusterkeys[0]           # smallest cluster id
        self.maxclusterId = tempclusterkeys[self.k - 1]  # largest cluster id
        self.mingoldstandardId = tempgoldstandardkeys[0]
        self.maxgoldstandardId = tempgoldstandardkeys[self.q - 1]
        # (clusterId, goldstandardId) -> number of documents shared by the pair
        self.coocurrence = {}
        N1 = 0
        N2 = 0
        for m in tempclusterkeys:
            N1 = N1 + len(self.clusters[m])
        for m in tempgoldstandardkeys:
            N2 = N2 + len(self.goldstandards[m])
        if N1 == N2:
            self.N = N1  # total number of documents
        else:
            print 'there is an error: N1=%d, N2=%d, please re-examine the data source' % (N1, N2)

    def GenerateCoocurrence(self):
        for key_cluster in self.clusters.keys():
            set1 = set(self.clusters[key_cluster])
            for key_gold in self.goldstandards.keys():
                set2 = set(self.goldstandards[key_gold])
                Num = len(set1 & set2)
                if not self.coocurrence.has_key((key_cluster, key_gold)):
                    self.coocurrence[(key_cluster, key_gold)] = Num

    def CalPurityForPerCluster(self, clusterId):
        NumCollection = []
        for Id in range(self.mingoldstandardId, self.maxgoldstandardId + 1):
            NumCollection.append(self.coocurrence[(clusterId, Id)])
        NumCollection.sort()
        # largest overlap with any gold-standard class, normalized by cluster size
        return float(NumCollection[len(NumCollection) - 1]) / float(len(self.clusters[clusterId]))

    def CalPurity(self):
        result = 0.0
        for clusterId in range(self.minclusterId, self.maxclusterId + 1):
            purityPer = self.CalPurityForPerCluster(clusterId)
            result = result + float(len(self.clusters[clusterId])) * purityPer / float(self.N)
        return result

    def CalEntropyFormula(self, seq):
        result = 0.0
        for elemP in seq:
            if elemP > 0:
                result = result + elemP * math.log(elemP, 2)
        return -result

    def CalEntropyForPerCluster(self, clusterId):
        seq = []
        for Id in range(self.mingoldstandardId, self.maxgoldstandardId + 1):
            Prob = float(self.coocurrence[(clusterId, Id)]) / float(len(self.clusters[clusterId]))
            seq.append(Prob)
        return self.CalEntropyFormula(seq)

    def CalEntropy(self):
        result = 0.0
        for clusterId in range(self.minclusterId, self.maxclusterId + 1):
            entropyPer = self.CalEntropyForPerCluster(clusterId)
            result = result + float(len(self.clusters[clusterId])) * entropyPer / float(self.N)
        return result

    def CalMutualInformation(self):
        result = 0.0
        for clusterId in range(self.minclusterId, self.maxclusterId + 1):
            N_c = len(self.clusters[clusterId])
            for goldId in range(self.mingoldstandardId, self.maxgoldstandardId + 1):
                N_g = len(self.goldstandards[goldId])
                N_cg = self.coocurrence[(clusterId, goldId)]
                part = float(self.N) * float(N_cg) / (N_c * N_g)
                if part > 0:
                    result = result + (float(N_cg) / float(self.N)) * math.log(part, 2)
        return result

    def CalNMI(self):
        seq1 = []  # cluster-size probabilities, for the entropy of the automated clusters
        seq2 = []  # class-size probabilities, for the entropy of the gold-standard classes
        for clusterId in range(self.minclusterId, self.maxclusterId + 1):
            seq1.append(float(len(self.clusters[clusterId])) / float(self.N))
        for goldId in range(self.mingoldstandardId, self.maxgoldstandardId + 1):
            seq2.append(float(len(self.goldstandards[goldId])) / float(self.N))
        H1 = self.CalEntropyFormula(seq1)
        H2 = self.CalEntropyFormula(seq2)
        IG = self.CalMutualInformation()
        NMI = 2 * IG / (H1 + H2)
        return NMI


if __name__ == "__main__":
    clusterAddress = str(sys.argv[1])
    goldAddress = str(sys.argv[2])
    e = Evaluation(clusterAddress, goldAddress)
    print 'number of clusters produced by the clustering algorithm: %d' % e.k
    print 'number of classes in the manually labeled gold standard: %d' % e.q
    print 'total number of documents: %d' % e.N
    print 'smallest cluster id: %d' % e.minclusterId
    print 'largest cluster id: %d' % e.maxclusterId
    print 'smallest gold-standard class id: %d' % e.mingoldstandardId
    print 'largest gold-standard class id: %d' % e.maxgoldstandardId
    e.GenerateCoocurrence()
    purity = e.CalPurity()
    print 'purity: %f' % purity
    entropy = e.CalEntropy()
    print 'entropy: %f' % entropy
    nmi = e.CalNMI()
    print 'normalized mutual information: %f' % nmi
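For reference, the quantities computed above are the standard definitions (c_k is the set of documents in automated cluster k, g_j the set of documents in gold-standard class j, N the total number of documents; logarithms are base 2, matching math.log(x, 2) in the code):

\[
\mathrm{purity} = \sum_{k}\frac{|c_k|}{N}\,\max_{j}\frac{|c_k \cap g_j|}{|c_k|},
\qquad
E = \sum_{k}\frac{|c_k|}{N}\left(-\sum_{j}\frac{|c_k \cap g_j|}{|c_k|}\,\log_2\frac{|c_k \cap g_j|}{|c_k|}\right)
\]

\[
I(C;G) = \sum_{k}\sum_{j}\frac{|c_k \cap g_j|}{N}\,\log_2\frac{N\,|c_k \cap g_j|}{|c_k|\,|g_j|},
\qquad
\mathrm{NMI}(C;G) = \frac{2\,I(C;G)}{H(C)+H(G)},
\quad
H(C) = -\sum_{k}\frac{|c_k|}{N}\log_2\frac{|c_k|}{N}
\]

CalPurity, CalEntropy, CalMutualInformation, and CalNMI evaluate these sums directly over the co-occurrence table built by GenerateCoocurrence. The script is run on the two pickles produced by DataManagement.py, for example: python EvaluationClusterAlgorithm.py clusters.pkl goldstandards.pkl (the pickle names here are placeholders).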
Figure: code invocation diagram.