1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- #!/usr/bin/python
- # coding=utf-8
- #Function:
- # match mention and entity
- #Data:
- # 2016-3-16
- import re
- MAX_LEN = 50
- def link(str, dict):
- str, flag = removeFx(str)
- # remove the mention that is too long
- if len(str) > 50:
- return None
- # remove the mention has symbol other than chinese
- p = r'\d+'
- m = re.findall(p, str)
- if m != []:
- return None
- # remove the mention has word that implies not a comp mention
- negativeword = ['交易所', '证监会', '银行', '监督', '管理', '委员会', '国务院','保监会', '政府', '酒店', '财政局', '事务所', '商务部', '发改委', '证券报']
- for word in negativeword:
- if str.find(word) >= 0:
- return None
-
- entity = match(str, dict)
- if entity == None:
- if flag == False:
- return None
- else:
- return str
- else:
- return entity
-
- # remove the common prefix and suffix
- def removeFx(str):
- flag = False
- dict1 = ['(', ')', '(', ')']
- dict2 = ['股份', '有限', '公司', '集团', '投资']
- comp = ['总公司','公司','有限','集团','股份','投资','发展','责任','合伙','销售','合作']
- symbol = ['(',')','《','》','(',')']
- for word in symbol:
- str = str.replace(word, '')
- for word in comp:
- str = str.replace(word, '')
- flag = True
- return str, flag
- def loaddict(filename):
- dict = []
- file = open(filename, 'r')
- for line in file.readlines():
- line = line.strip('\n')
- dict.append(line)
- return dict
- def match(mention, entity_dict):
- '''
- testing if a mention is an entity
- '''
- for entity in entity_dict:
- res = entity_match(mention, entity)
- if res != None:
- return res
- return None
- def entity_match(mention, entity):
- '''
- testing if a mention matchs a entity
- '''
- if mention.find(entity) >= 0:
- return entity
- else:
- return None
|