luojiehua
/
BIDI_ML_INFO_EXTRACTION


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
							#!/usr/bin/python
# coding=utf-8
#Function:
#	match mention and entity
#Date:
#	2016-3-16

import re

# Maximum mention length (as measured by len()); presumably the cutoff for
# the over-long-mention filter in link() -- TODO confirm.
MAX_LEN = 50

def link(str, dict):
    '''
    Link a raw mention to an entity, filtering out unlikely mentions.

    The mention is first passed through removeFx(). It is then rejected
    (None is returned) when the stripped text is longer than MAX_LEN,
    contains a digit, or contains one of the negative words that imply
    the mention is not a company.

    Args:
        str: the raw mention text.
        dict: the collection of known entities, handed to match().

    Returns:
        The entity returned by match() when there is one. Otherwise the
        stripped mention itself if the flag returned by removeFx() is
        set, or None if it is not.
    '''
    mention, flag = removeFx(str)
    # remove the mention that is too long
    if len(mention) > MAX_LEN:
        return None
    # remove the mention that contains any digit
    if re.search(r'\d', mention) is not None:
        return None
    # remove the mention has word that implies not a comp mention
    negativeword = ('交易所', '证监会', '银行', '监督', '管理', '委员会', '国务院',
                    '保监会', '政府', '酒店', '财政局', '事务所', '商务部', '发改委', '证券报')
    if any(word in mention for word in negativeword):
        return None

    entity = match(mention, dict)
    if entity is not None:
        return entity
    # No dictionary hit: fall back to the stripped mention only when
    # removeFx() set its flag.
    return mention if flag else None
        
# strip bracket symbols and common company affix words (anywhere in the mention)
def removeFx(str):
	'''
	Strip bracket symbols and common company affix words from a mention.

	Every occurrence is removed, wherever it appears in the text (not
	only at the start or the end).

	Args:
		str: the raw mention text.

	Returns:
		A (stripped, flag) tuple. flag is True only when at least one
		company affix word was actually removed; removing bracket
		symbols alone does not set it.
	'''
	comp = ['总公司', '公司', '有限', '集团', '股份', '投资', '发展', '责任', '合伙', '销售', '合作']
	symbol = ['(', ')', '《', '》', '（', '）']
	stripped = str
	for mark in symbol:
		stripped = stripped.replace(mark, '')
	flag = False
	for word in comp:
		# Set the flag only on a real hit, so the caller can tell whether
		# the mention carried a company affix at all.
		if word in stripped:
			stripped = stripped.replace(word, '')
			flag = True
	return stripped, flag

def loaddict(filename):
	'''
	Load a dictionary file into a list, one entry per line.

	Args:
		filename: path of the text file to read.

	Returns:
		A list with one string per line of the file, in file order.
		Newline characters are stripped from both ends of each line;
		other whitespace is kept, and a blank line yields an empty
		string.
	'''
	# The with-block closes the file handle even if reading fails.
	with open(filename, 'r') as handle:
		return [line.strip('\n') for line in handle]

def match(mention, entity_dict):
	'''
	Find the first entity in entity_dict that matches the mention.

	Args:
		mention: the mention text to look up.
		entity_dict: iterable of candidate entities, tried in order.

	Returns:
		The first non-None result of entity_match(mention, entity), or
		None when no candidate matches.
	'''
	for entity in entity_dict:
		res = entity_match(mention, entity)
		if res is not None:
			return res
	return None

def entity_match(mention, entity):
	'''
	Test whether a mention matches an entity.

	A mention matches when the entity text occurs somewhere inside it.

	Args:
		mention: the mention text to search in.
		entity: the candidate entity text.

	Returns:
		entity if it is a non-empty substring of mention, otherwise None.
	'''
	# An empty entity is a substring of every string; without this guard
	# it would match every mention.
	if not entity:
		return None
	return entity if entity in mention else None