entity_match.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. #!/usr/bin/python
  2. # coding=utf-8
# Function:
#   match mentions against entities
# Date:
#   2016-3-16
  7. import re
  8. MAX_LEN = 50
  9. def link(str, dict):
  10. str, flag = removeFx(str)
  11. # remove the mention that is too long
  12. if len(str) > 50:
  13. return None
  14. # remove the mention has symbol other than chinese
  15. p = r'\d+'
  16. m = re.findall(p, str)
  17. if m != []:
  18. return None
  19. # remove the mention has word that implies not a comp mention
  20. negativeword = ['交易所', '证监会', '银行', '监督', '管理', '委员会', '国务院','保监会', '政府', '酒店', '财政局', '事务所', '商务部', '发改委', '证券报']
  21. for word in negativeword:
  22. if str.find(word) >= 0:
  23. return None
  24. entity = match(str, dict)
  25. if entity == None:
  26. if flag == False:
  27. return None
  28. else:
  29. return str
  30. else:
  31. return entity
  32. # remove the common prefix and suffix
  33. def removeFx(str):
  34. flag = False
  35. dict1 = ['(', ')', '(', ')']
  36. dict2 = ['股份', '有限', '公司', '集团', '投资']
  37. comp = ['总公司','公司','有限','集团','股份','投资','发展','责任','合伙','销售','合作']
  38. symbol = ['(',')','《','》','(',')']
  39. for word in symbol:
  40. str = str.replace(word, '')
  41. for word in comp:
  42. str = str.replace(word, '')
  43. flag = True
  44. return str, flag
  45. def loaddict(filename):
  46. dict = []
  47. file = open(filename, 'r')
  48. for line in file.readlines():
  49. line = line.strip('\n')
  50. dict.append(line)
  51. return dict
  52. def match(mention, entity_dict):
  53. '''
  54. testing if a mention is an entity
  55. '''
  56. for entity in entity_dict:
  57. res = entity_match(mention, entity)
  58. if res != None:
  59. return res
  60. return None
  61. def entity_match(mention, entity):
  62. '''
  63. testing if a mention matchs a entity
  64. '''
  65. if mention.find(entity) >= 0:
  66. return entity
  67. else:
  68. return None