"""Collect tag and dependency-relation frequencies from CoNLL-style files.

``get_pos_relation`` counts the values found in the CPOSTAG, POSTAG and
DEPREL columns and writes them to ``cpostag.txt``, ``postag.txt`` and
``relation.txt`` in the current working directory.  ``get_legal_postag`` and
``get_legal_relation`` read those files back and keep only the entries that
occur often enough.
"""
from collections import Counter

# Default input files for get_pos_relation().
filenames = ["data/THU/train.conll", "data/THU/dev.conll"]


def _write_counts(path, counts):
    """Write ``(tag, count)`` pairs to *path*, one ``tag<TAB>count`` per line."""
    with open(path, "w", encoding="utf8") as f:
        for tag, count in counts:
            f.write("%s\t%s\n" % (tag, count))


def _read_legal_tags(path, minnum):
    """Return the tags in *path* whose recorded count is at least *minnum*.

    Blank lines are skipped so that a stray empty line (for example a
    trailing newline added by an editor) does not abort the read.
    """
    legal = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Exactly two tab-separated fields are expected; anything else
            # raises ValueError, as does a non-integer count.
            tag, count = line.split("\t")
            if int(count) >= minnum:
                legal.append(tag)
    return legal


def get_pos_relation(filenames):
    """Count CPOSTAG, POSTAG and DEPREL values across *filenames*.

    Each non-blank line is stripped and split on tabs; the first eight
    columns are read as id, form, lemma, cpostag, postag, feats, head and
    deprel.  The three frequency tables are printed (together with their
    sizes) and written, most frequent first, to ``cpostag.txt``,
    ``postag.txt`` and ``relation.txt`` in the current working directory.

    Args:
        filenames: iterable of paths to UTF-8 encoded, tab-separated files.

    Raises:
        ValueError: if a non-blank line has fewer than eight columns.
    """
    cpostag_counts = Counter()
    postag_counts = Counter()
    relation_counts = Counter()

    for filename in filenames:
        with open(filename, "r", encoding="utf8") as f:
            for lineno, line in enumerate(f, 1):
                stripped = line.strip()
                if not stripped:
                    # Blank lines only separate sentences.
                    continue
                fields = stripped.split("\t")
                if len(fields) < 8:
                    raise ValueError(
                        "%s:%d: expected at least 8 tab-separated columns, got %d"
                        % (filename, lineno, len(fields))
                    )
                cpostag, postag, deprel = fields[3], fields[4], fields[7]
                cpostag_counts[cpostag] += 1
                postag_counts[postag] += 1
                relation_counts[deprel] += 1

    # most_common() sorts by descending count and keeps first-seen order
    # among equal counts, matching a stable sort over insertion order.
    list_cpostag = cpostag_counts.most_common()
    list_postag = postag_counts.most_common()
    list_relation = relation_counts.most_common()

    print(list_cpostag, len(list_cpostag))
    print(list_postag, len(list_postag))
    print(list_relation, len(list_relation))

    _write_counts("cpostag.txt", list_cpostag)
    _write_counts("postag.txt", list_postag)
    _write_counts("relation.txt", list_relation)


def get_legal_postag(minnum=100):
    """Return the tags listed in ``postag.txt`` with a count of at least *minnum*.

    Tags are returned in file order.
    """
    return _read_legal_tags("postag.txt", minnum)


def get_legal_relation(minnum=100):
    """Return the relations listed in ``relation.txt`` with a count of at least *minnum*.

    Relations are returned in file order.
    """
    return _read_legal_tags("relation.txt", minnum)


if __name__ == '__main__':
    # The tag files read below are produced by get_pos_relation(filenames);
    # that call is not made here, so the files must already exist.
    get_legal_postag()
    get_legal_relation()