# Input paths intended for get_pos_relation (see the commented-out call in the
# __main__ block). Presumably CoNLL-style treebank files, judging by the
# extension and the tab-separated columns that get_pos_relation reads.
filenames = [
    "data/THU/train.conll",
    "data/THU/dev.conll",
]


def _write_counts(path, counts):
    """Write ``(label, count)`` pairs to *path* as ``label<TAB>count`` lines."""
    with open(path, "w", encoding="utf8") as f:
        f.writelines("%s\t%s\n" % (label, count) for label, count in counts)


def get_pos_relation(filenames):
    """Tally tag and relation frequencies over tab-separated corpus files.

    Every non-blank line of every file must have at least eight
    tab-separated columns, read as ``id, form, lemma, cpostag, postag,
    feats, head, deprel``.  Occurrences of the ``cpostag``, ``postag`` and
    ``deprel`` columns are counted across all files.

    The three frequency tables are sorted by descending count (ties keep
    first-seen order), printed together with their sizes, and written to
    ``cpostag.txt``, ``postag.txt`` and ``relation.txt`` in the current
    working directory, one ``label<TAB>count`` pair per line.

    Args:
        filenames: Iterable of paths to UTF-8 encoded input files.

    Raises:
        ValueError: If a non-blank line has fewer than eight columns; the
            message names the offending file and line number.
    """
    from collections import Counter

    cpostag_counts = Counter()
    postag_counts = Counter()
    relation_counts = Counter()

    for filename in filenames:
        with open(filename, "r", encoding="utf8") as f:
            for lineno, raw_line in enumerate(f, 1):
                line = raw_line.strip()
                if not line:
                    # Blank lines carry no token and are ignored.
                    continue
                fields = line.split("\t")
                if len(fields) < 8:
                    raise ValueError(
                        "%s:%d: expected at least 8 tab-separated fields, got %d"
                        % (filename, lineno, len(fields))
                    )
                # Columns 4, 5 and 8 are cpostag, postag and deprel.
                cpostag_counts[fields[3]] += 1
                postag_counts[fields[4]] += 1
                relation_counts[fields[7]] += 1

    # most_common() is a stable descending sort on the count, so labels with
    # equal counts stay in the order they were first encountered.
    list_cpostag = cpostag_counts.most_common()
    list_postag = postag_counts.most_common()
    list_relation = relation_counts.most_common()

    print(list_cpostag,len(list_cpostag))
    print(list_postag,len(list_postag))
    print(list_relation,len(list_relation))

    _write_counts("cpostag.txt", list_cpostag)
    _write_counts("postag.txt", list_postag)
    _write_counts("relation.txt", list_relation)

def get_legal_postag(minnum=100):
    """Return the tags listed in ``postag.txt`` whose count is >= *minnum*.

    ``postag.txt`` is read from the current working directory and is
    expected to hold one ``tag<TAB>count`` pair per line, the layout that
    ``get_pos_relation`` writes.  Blank lines are ignored.

    Args:
        minnum: Minimum count (inclusive) a tag needs to be kept.

    Returns:
        List of tag strings, in the order they appear in the file.

    Raises:
        ValueError: If a non-blank line is not exactly two tab-separated
            fields or its count is not an integer.
    """
    legal_postag = []
    with open("postag.txt","r",encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A stray or trailing empty line would otherwise break the
                # two-way unpack below.
                continue
            k, v = line.split("\t")
            if int(v) >= minnum:
                legal_postag.append(k)
    return legal_postag

def get_legal_relation(minnum=100):
    """Return the relations listed in ``relation.txt`` with count >= *minnum*.

    ``relation.txt`` is read from the current working directory and is
    expected to hold one ``relation<TAB>count`` pair per line, the layout
    that ``get_pos_relation`` writes.  Blank lines are ignored.

    Args:
        minnum: Minimum count (inclusive) a relation needs to be kept.

    Returns:
        List of relation strings, in the order they appear in the file.

    Raises:
        ValueError: If a non-blank line is not exactly two tab-separated
            fields or its count is not an integer.
    """
    legal_relation = []
    with open("relation.txt","r",encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A stray or trailing empty line would otherwise break the
                # two-way unpack below.
                continue
            k, v = line.split("\t")
            if int(v) >= minnum:
                legal_relation.append(k)
    return legal_relation

if __name__ == "__main__":
    # Both loaders read frequency tables (postag.txt / relation.txt) that
    # get_pos_relation writes; re-enable the call below to regenerate them.
    # get_pos_relation(filenames)

    # The returned lists are discarded here, so running the script only
    # exercises the two loaders with their default threshold.
    for load_legal in (get_legal_postag, get_legal_relation):
        load_legal()