博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
聚类算法效果评估:熵(entropy)、纯度(purity)与归一化互信息(NMI)
阅读量:6374 次
发布时间:2019-06-23

本文共 5416 字,大约阅读时间需要 18 分钟。

1. 数据管理脚本:原始文件每行一条记录,格式为 id\tclusterId\tgoldstandardId(三个整数字段,以制表符分隔)

DataManagement.py

#!/usr/bin/python
import cPickle as p;
import sys;
import re;
if __name__ == "__main__":
    # Usage: DataManagement.py <source file> <clusters pickle> <gold-standard pickle>
    #
    # Reads a tab-separated source file whose lines look like
    # "id<TAB>clusterId<TAB>goldstandardId" and writes two pickled dicts:
    #   clusters:      clusterId      -> [id, ...]
    #   goldstandards: goldstandardId -> [id, ...]
    filename = str(sys.argv[1])
    clusters = {}
    goldstandards = {}

    # `with` guarantees the source file is closed even if a line fails to parse.
    with open(filename, 'r') as fidsrc:
        for line in fidsrc:
            m = line.strip().split('\t')
            if len(m) != 3:
                # Lines that do not have exactly three fields are ignored.
                continue
            doc_id = int(m[0])
            # setdefault creates the list on first sight of a key, replacing
            # the duplicated has_key()/else branches.
            clusters.setdefault(int(m[1]), []).append(doc_id)
            goldstandards.setdefault(int(m[2]), []).append(doc_id)

    # The dumps use the default pickle protocol and text mode 'w'; the mode is
    # kept unchanged so files stay consistent with readers that open them in
    # text mode.
    with open(sys.argv[2], 'w') as fidclusters:
        p.dump(clusters, fidclusters)
    with open(sys.argv[3], 'w') as fidgoldstandards:
        p.dump(goldstandards, fidgoldstandards)

    print('%s has finished!' % sys.argv[0])

 

EvaluationClusterAlgorithm.py

#!/usr/bin/python
# -*- coding:cp936 -*-
import re;
import cPickle as mypickle;
import sys;
import math;
class Evaluation:
    """Compare a clustering result with gold-standard labels.

    Both inputs are pickled dicts mapping a group ID to the list of document
    IDs in that group. After construction, call GenerateCoocurrence() once
    before any of the Cal* methods; they all read self.coocurrence.

    Attributes set by __init__:
        clusters, goldstandards: the two loaded dicts.
        k, q: number of clusters / number of gold-standard groups.
        minclusterId, maxclusterId: smallest / largest cluster ID.
        mingoldstandardId, maxgoldstandardId: smallest / largest gold ID.
        coocurrence: dict keyed by (clusterId, goldstandardId) holding the
            number of documents shared by the two groups (initially empty).
        N: total number of documents.
    """

    def __init__(self, clusterfid, goldstandardfid):
        """Load both pickles and derive the summary attributes.

        Args:
            clusterfid: path of the pickled clustering result.
            goldstandardfid: path of the pickled gold-standard answer.

        Raises:
            ValueError: if the two dicts do not hold the same number of
                documents (previously this only printed a message and left
                self.N undefined, which failed later with AttributeError).
            IndexError: if either dict is empty.
        """
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files produced by a trusted source.
        # Text mode is kept so reading stays consistent with how the files
        # were written; `with` closes the handles, which were leaked before.
        with open(clusterfid) as handle:
            self.clusters = mypickle.load(handle)
        with open(goldstandardfid) as handle:
            self.goldstandards = mypickle.load(handle)

        cluster_ids = sorted(self.clusters)
        gold_ids = sorted(self.goldstandards)
        self.k = len(cluster_ids)
        self.q = len(gold_ids)
        self.minclusterId = cluster_ids[0]  # smallest cluster ID
        self.maxclusterId = cluster_ids[-1]  # largest cluster ID
        self.mingoldstandardId = gold_ids[0]
        self.maxgoldstandardId = gold_ids[-1]
        # (clusterId, goldstandardId) -> number of shared documents
        self.coocurrence = {}

        N1 = sum(len(members) for members in self.clusters.values())
        N2 = sum(len(members) for members in self.goldstandards.values())
        if N1 != N2:
            raise ValueError(
                'there is a error N1=%d,N2=%d,please reexamine the data source'
                % (N1, N2))
        self.N = N1  # number of documents

    def GenerateCoocurrence(self):
        """Fill self.coocurrence with the overlap size of every pair.

        Entries that already exist are left untouched.
        """
        # Build each gold-standard set once instead of once per cluster.
        gold_sets = {}
        for gold_id, members in self.goldstandards.items():
            gold_sets[gold_id] = set(members)
        for cluster_id, members in self.clusters.items():
            cluster_set = set(members)
            for gold_id, gold_set in gold_sets.items():
                self.coocurrence.setdefault(
                    (cluster_id, gold_id), len(cluster_set & gold_set))

    def CalPurityForPerCluster(self, clusterId):
        """Return the largest overlap of clusterId with any gold group,
        divided by the size of the cluster."""
        # Iterate over the gold IDs that exist; a range(min, max + 1) scan
        # raised KeyError whenever the IDs were not contiguous.
        largest = max(self.coocurrence[(clusterId, gold_id)]
                      for gold_id in self.goldstandards)
        return float(largest) / float(len(self.clusters[clusterId]))

    def CalPurity(self):
        """Return the size-weighted average of the per-cluster purities."""
        result = 0.0
        for clusterId in sorted(self.clusters):
            purityPer = self.CalPurityForPerCluster(clusterId)
            result += float(len(self.clusters[clusterId])) * purityPer / float(self.N)
        return result

    def CalEntropyFormula(self, seq):
        """Return -sum(p * log2(p)) over the positive values in seq."""
        result = 0.0
        for elemP in seq:
            if elemP > 0:  # non-positive values contribute nothing
                result += elemP * math.log(elemP, 2)
        return -result

    def CalEntropyForPerCluster(self, clusterId):
        """Return the entropy of the gold-group distribution inside clusterId."""
        cluster_size = float(len(self.clusters[clusterId]))
        seq = [float(self.coocurrence[(clusterId, gold_id)]) / cluster_size
               for gold_id in sorted(self.goldstandards)]
        return self.CalEntropyFormula(seq)

    def CalEntropy(self):
        """Return the size-weighted average of the per-cluster entropies."""
        result = 0
        for clusterId in sorted(self.clusters):
            entropyPer = self.CalEntropyForPerCluster(clusterId)
            result += float(len(self.clusters[clusterId])) * entropyPer / float(self.N)
        return result

    def CalMutualInformation(self):
        """Return the mutual information (base 2) between the clustering and
        the gold-standard grouping."""
        result = 0.0
        total = float(self.N)
        for clusterId in sorted(self.clusters):
            N_c = len(self.clusters[clusterId])
            for goldId in sorted(self.goldstandards):
                N_cg = self.coocurrence[(clusterId, goldId)]
                if N_cg <= 0:
                    # An empty intersection adds nothing; testing it first
                    # also avoids dividing by an empty group's size.
                    continue
                N_g = len(self.goldstandards[goldId])
                part = total * float(N_cg) / (N_c * N_g)
                result += (float(N_cg) / total) * math.log(part, 2)
        return result

    def CalNMI(self):
        """Return 2 * I / (H1 + H2), where H1 and H2 are the entropies of the
        cluster sizes and of the gold-group sizes.

        When both entropies are zero (each side is a single group) the ratio
        is undefined; 1.0 is returned instead of raising ZeroDivisionError,
        since the two partitions are then identical.
        """
        total = float(self.N)
        # entropy of the automated clusters
        seq1 = [float(len(self.clusters[clusterId])) / total
                for clusterId in sorted(self.clusters)]
        # entropy of the gold-standard clusters
        seq2 = [float(len(self.goldstandards[goldId])) / total
                for goldId in sorted(self.goldstandards)]
        H1 = self.CalEntropyFormula(seq1)
        H2 = self.CalEntropyFormula(seq2)
        IG = self.CalMutualInformation()
        denominator = H1 + H2
        if denominator == 0:
            return 1.0
        return 2 * IG / denominator
        
        
        
if __name__ == "__main__":
    # Usage: EvaluationClusterAlgorithm.py <clusters pickle> <gold-standard pickle>
    clusterAddress = str(sys.argv[1])
    goldAddress = str(sys.argv[2])
    e = Evaluation(clusterAddress, goldAddress)

    # Summary lines, printed in this order. Each attribute is looked up only
    # when its own line is printed.
    overview = (
        ('聚类算法产生簇个数%d', 'k'),
        ('人工标注的标准答案中簇个数%d', 'q'),
        ('文档总数%d', 'N'),
        ('最小聚类ID标号%d', 'minclusterId'),
        ('最大聚类ID标号%d', 'maxclusterId'),
        ('标准答案中最小聚类ID标号%d', 'mingoldstandardId'),
        ('标准答案中最大聚类ID标号%d', 'maxgoldstandardId'),
    )
    for template, attribute in overview:
        print(template % getattr(e, attribute))

    # The co-occurrence table must exist before any metric is computed.
    e.GenerateCoocurrence()

    purity = e.CalPurity()
    print('纯度为%f' % purity)

    entropy = e.CalEntropy()
    print('熵为%f' % entropy)

    nmi = e.CalNMI()
    print('归一化互信息为%f' % nmi)
   
        

 

代码调用示意图

 

 

 

 

 

转载地址:http://vojqa.baihongyu.com/

你可能感兴趣的文章
利用Oracle VPD实现行级安全保护(一)
查看>>
Spring中数据绑定的两种方式(BeanWrapperImpl或者DataBinder)
查看>>
一个关于Cobar 的释疑
查看>>
Outlook 2007对于不同(Gmail.Sina.Sohu.126.163.Yahoo.Hotmail)Email账号设置
查看>>
NUMA与英特尔下一代Xeon处理器学习心得(9)
查看>>
Inter VLAN routing router on GNS3
查看>>
一个简单的密码学实例
查看>>
Vlan配置详解之单臂路由
查看>>
沟通的艺术之幻灯片这奇女子
查看>>
【一天一个shell命令】文本内容操作系列-awk补充二(函数)
查看>>
自己分析的子网划分原理
查看>>
嘀嘀和快的上演十月围城 020模式被强化
查看>>
Java减肥高手Xtend 捆绑Eclipse IDE
查看>>
监控利器Nagios的替代者Icinga
查看>>
Windows脚本初探之VBScript常量和变量
查看>>
Gradle 1.12用户指南翻译——第三十章. CodeNarc 插件
查看>>
Oracle 10g R2升级到Oracle 11g R2
查看>>
Oracle Database字符集(1)--案例分析
查看>>
kvm 性能调优
查看>>
如何禁止远程桌面时使用本地administrator登录
查看>>