corenlp.py

import logging
import os
import stat
import subprocess
import sys
from functools import lru_cache

import xmltodict

import iepy
from iepy.utils import DIRS, unzip_from_url

logger = logging.getLogger(__name__)

def detect_java_version():
    """Return the major Java version (as an int) reported by the JAVAHOME binary."""
    java_cmd = os.getenv('JAVAHOME')
    if not java_cmd:
        print('Environment variable JAVAHOME not defined.')
        sys.exit(-1)
    here = os.path.dirname(os.path.realpath(__file__))
    jar = os.path.join(here, 'utils', 'get-java-version.jar')
    jversion = subprocess.check_output([java_cmd, "-jar", jar], stderr=subprocess.PIPE)
    return int(jversion.strip())

JAVA_VERSION = detect_java_version()

_STANFORD_BASE_URL = "http://nlp.stanford.edu/software/"

if JAVA_VERSION < 8:
    # Stanford CoreNLP 3.4.1 - the last version to support Java 6 and Java 7.
    # Unfortunately, the public name ("version") of Stanford releases is not
    # what their download URLs use: 3.4.1 is "stanford-corenlp-full-2014-08-27".
    _CORENLP_VERSION = "stanford-corenlp-full-2014-08-27"
    DOWNLOAD_URL_ES = _STANFORD_BASE_URL + 'stanford-spanish-corenlp-2014-08-26-models.jar'
    DOWNLOAD_URL_DE = _STANFORD_BASE_URL + 'stanford-german-2016-01-19-models.jar'
else:
    # Stanford CoreNLP 3.5.2
    _CORENLP_VERSION = "stanford-corenlp-full-2015-04-20"
    DOWNLOAD_URL_ES = _STANFORD_BASE_URL + 'stanford-spanish-corenlp-2015-01-08-models.jar'
    DOWNLOAD_URL_DE = _STANFORD_BASE_URL + 'stanford-german-2016-01-19-models.jar'

DOWNLOAD_URL = _STANFORD_BASE_URL + _CORENLP_VERSION + ".zip"
_FOLDER_PATH = os.path.join(DIRS.user_data_dir, _CORENLP_VERSION)
COMMAND_PATH = os.path.join(_FOLDER_PATH, "corenlp.sh")
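# Note: DIRS comes from iepy.utils; on Linux, DIRS.user_data_dir is typically
# something like "~/.local/share/iepy" (the exact path is platform dependent),
# so COMMAND_PATH resolves to e.g.
# "~/.local/share/iepy/stanford-corenlp-full-2015-04-20/corenlp.sh".
# These paths are illustrative assumptions, not guaranteed values.
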
@lru_cache(maxsize=1)
def get_analizer(*args, **kwargs):
    # Cached so that the (expensive to start) CoreNLP wrapper is built only once.
    logger.info("Loading StanfordCoreNLP...")
    return StanfordCoreNLP(*args, **kwargs)

class StanfordCoreNLP:
    CMD_ARGS = "-outputFormat xml -threads 4"
    PROMPT = b"\nNLP> "

    def __init__(self, tokenize_with_whitespace=False, gazettes_filepath=None):
        cmd_args = self.command_args(tokenize_with_whitespace, gazettes_filepath)
        os.chdir(_FOLDER_PATH)
        self.corenlp_cmd = [COMMAND_PATH] + cmd_args
        self._start_proc()

    def _start_proc(self):
        self.proc = subprocess.Popen(
            self.corenlp_cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            cwd=_FOLDER_PATH
        )
        self.output = self.iter_output_segments()
        self.receive()  # Wait until the prompt is ready

    def command_args(self, tokenize_with_whitespace, gazettes_filepath):
        annotators = ["tokenize", "ssplit", "pos", "lemma", "ner", "parse", "dcoref"]
        cmd_args = self.CMD_ARGS[:]
        if tokenize_with_whitespace:
            cmd_args += " -tokenize.whitespace=true"
        if gazettes_filepath:
            annotators.insert(annotators.index("ner") + 1, "regexner")
            cmd_args += " -regexner.mapping {}".format(gazettes_filepath)
        tkn_opts = self._tokenizer_options()
        if tkn_opts:
            cmd_args += " " + tkn_opts
        lang = iepy.instance.settings.IEPY_LANG
        edu_mods = "edu/stanford/nlp/models"
        if lang == 'es':
            annotators.remove('dcoref')  # coreference is not supported for Spanish
            cmd_args += " -tokenize.language es"
            cmd_args += " -pos.model %s/pos-tagger/spanish/spanish-distsim.tagger" % edu_mods
            cmd_args += " -ner.model %s/ner/spanish.ancora.distsim.s512.crf.ser.gz" % edu_mods
            cmd_args += " -parse.model %s/lexparser/spanishPCFG.ser.gz" % edu_mods
        elif lang == 'de':
            annotators.remove('dcoref')  # coreference is not supported for German
            cmd_args += " -tokenize.language de"
            cmd_args += " -pos.model %s/pos-tagger/german/german-dewac.tagger" % edu_mods
            cmd_args += " -ner.model %s/ner/german.dewac_175m_600.crf.ser.gz" % edu_mods
            cmd_args += " -parse.model %s/lexparser/germanPCFG.ser.gz" % edu_mods
        cmd_args += " -annotators {}".format(",".join(annotators))
        return cmd_args.split()
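    # For example, with IEPY_LANG == 'en' and no customizations, the arguments
    # handed to corenlp.sh work out to:
    #   -outputFormat xml -threads 4
    #   -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref
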
    def _tokenizer_options(self):
        """Build the -tokenize.options flag from the instance settings.

        As stated in
        http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/process/PTBTokenizer.html
        there are several tokenizer options that can be changed. Only those
        that differ from the Stanford defaults are sent on the command line.
        """
        extra_keys = ['ptb3Escaping']
        defaults = {
            'invertible': False,
            'tokenizeNLs': False,
            'americanize': True,
            'normalizeSpace': True,
            'normalizeAmpersandEntity': True,
            'normalizeCurrency': True,
            'normalizeFractions': True,
            'normalizeParentheses': True,
            'normalizeOtherBrackets': True,
            'asciiQuotes': False,
            'latexQuotes': True,
            'unicodeQuotes': False,
            'ptb3Ellipsis': True,
            'unicodeEllipsis': False,
            'ptb3Dashes': True,
            'keepAssimilations': True,
            'escapeForwardSlashAsterisk': True,
            'untokenizable': "firstDelete",
            'strictTreebank3': False
        }
        allowed_keys = set(defaults.keys()).union(extra_keys)
        customizations = getattr(iepy.instance.settings, 'CORENLP_TKN_OPTS', {})
        opts = []
        for k, v in customizations.items():
            if k not in allowed_keys:
                raise ValueError('Invalid key "%s". Valid options are %s' % (k, allowed_keys))
            if k in defaults and defaults[k] == v:
                # Valid option, but it matches the default, so no need to send it.
                continue
            if isinstance(v, bool):
                v = ("%s" % v).lower()
            opts.append("%s=%s" % (k, v))
        if opts:
            return '-tokenize.options "{}"'.format(','.join(opts))
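    # Example (hypothetical settings; CORENLP_TKN_OPTS would live in the IEPY
    # instance settings module):
    #   CORENLP_TKN_OPTS = {'americanize': False, 'ptb3Escaping': True}
    # makes this method return:
    #   -tokenize.options "americanize=false,ptb3Escaping=true"
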
    def iter_output_segments(self):
        while True:
            buf = b""
            while self.PROMPT not in buf:
                # read1 returns whatever bytes are currently available (up to
                # 1024) instead of blocking until the full count is read.
                buf += self.proc.stdout.read1(1024)
                if self.proc.poll() is not None:
                    # Bail out if the CoreNLP process died with any exit code;
                    # otherwise this loop would spin forever on empty reads.
                    logger.error("Error running '{}'".format(" ".join(self.corenlp_cmd)))
                    logger.error("Output was: '{}'".format(buf))
                    sys.exit(1)
            segment, _, buf = buf.partition(self.PROMPT)
            yield segment.decode("utf8")

    def receive(self):
        return next(self.output)

    def send(self, data):
        # The interactive shell treats a newline as end of input, so any
        # newlines inside the text are collapsed before terminating the line.
        data = data.replace("\n", " ") + "\n"
        self.proc.stdin.write(data.encode("utf8"))
        self.proc.stdin.flush()

    def quit(self):
        self.proc.stdin.write("q\n".encode("utf8"))
        self.proc.stdin.flush()

    @lru_cache(maxsize=1)
    def analyse(self, text):
        self.send(text)
        text = self.receive()
        # Discard any startup/logging noise printed before the XML declaration.
        i = text.index("<?xml version")
        text = text[i:]
        return xmltodict.parse(text)["root"]["document"]
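    # The result is xmltodict's dict view of CoreNLP's XML output. As a rough
    # sketch (the exact nesting depends on the annotators enabled):
    #   doc["sentences"]["sentence"] -> the parsed sentences, each carrying
    #   ["tokens"]["token"] entries with word, lemma, POS and NER attributes.
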
def download(lang='en'):
    base = os.path.dirname(COMMAND_PATH)
    if os.path.isfile(COMMAND_PATH):
        print("Stanford CoreNLP is already downloaded at {}.".format(base))
    else:
        print("Downloading Stanford CoreNLP...")
        unzip_from_url(DOWNLOAD_URL, DIRS.user_data_dir)
        # Zip acquired. Make sure the right Java is used and the runner is executable.
        for directory in os.listdir(DIRS.user_data_dir):
            if directory.startswith("stanford-corenlp-full"):
                stanford_directory = os.path.join(DIRS.user_data_dir, directory)
                if os.path.isdir(stanford_directory):
                    runner_path = os.path.join(stanford_directory, "corenlp.sh")
                    st = os.stat(runner_path)
                    with open(runner_path) as runner_file:
                        _content = runner_file.read()
                    _content = _content.replace('java', '$JAVAHOME')
                    with open(runner_path, 'w') as runner_file:
                        runner_file.write(_content)
                    os.chmod(runner_path, st.st_mode | stat.S_IEXEC)
                    break

    # Download extra data for the specific language.
    download_urls = dict(es=DOWNLOAD_URL_ES, de=DOWNLOAD_URL_DE)
    if lang.lower() in download_urls:
        print("Downloading Stanford CoreNLP extra data for lang '{}'...".format(lang))
        unzip_from_url(download_urls[lang.lower()], _FOLDER_PATH)
    elif lang.lower() != 'en':
        print("There is no extra data to download for lang '{}'.".format(lang))
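

# Minimal smoke test: an illustrative sketch, not part of IEPY's public API.
# It assumes JAVAHOME is set, an IEPY instance is configured (it supplies
# IEPY_LANG), and network access is available for the first download.
if __name__ == '__main__':
    download()
    analyzer = get_analizer()
    doc = analyzer.analyse("The quick brown fox jumps over the lazy dog.")
    print(doc["sentences"])
    analyzer.quit()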