#!/usr/bin/env python
"""Conversion scripts related to Stanford tools.

Author:     Pontus Stenetorp    <pontus stenetorp se>
Version:    2012-06-26
"""
# TODO: Currently pretty much every single call re-parses the XML, optimise?
# TODO: We could potentially put the lemma into a comment

from collections import defaultdict
from os.path import join as path_join
from os.path import dirname
from sys import path as sys_path
from sys import argv, stderr, stdout
from xml.etree import ElementTree

try:
    from .ptbesc import unescape as ptb_unescape
except ImportError:
    # Allow running this file directly as a script as well as a module
    from ptbesc import unescape as ptb_unescape

try:
    from collections import namedtuple
except ImportError:
    sys_path.append(path_join(dirname(__file__), '..', '..', 'lib'))
    from altnamedtuple import namedtuple

try:
    from annotation import (BinaryRelationAnnotation, EquivAnnotation,
                            TextBoundAnnotation)
except ImportError:
    sys_path.append(path_join(dirname(__file__), '..'))
    from annotation import (BinaryRelationAnnotation, EquivAnnotation,
                            TextBoundAnnotation)

Token = namedtuple('Token', ('word', 'lemma', 'start', 'end', 'pos', 'ner', ))
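

# Map PoS tags onto names that are safe to use as brat annotation types,
# e.g. _escape_pos_tags('$') == '__DOLLAR__'.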
def _escape_pos_tags(pos):
    pos_res = pos
    for _from, to in (
            ("'", '__SINGLEQUOTE__', ),
            ('"', '__DOUBLEQUOTE__', ),
            ('$', '__DOLLAR__', ),
            (',', '__COMMA__', ),
            ('.', '__DOT__', ),
            (':', '__COLON__', ),
            ('`', '__BACKTICK__', ),
    ):
        pos_res = pos_res.replace(_from, to)
    return pos_res
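

# Read every token from the XML into a nested mapping keyed by sentence id
# and then token id, e.g. token_by_ids[1][2] is the second token of the
# first sentence.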
def _token_by_ids(soup):
    token_by_ids = defaultdict(dict)

    for sent_e in _find_sentences_element(soup).iter('sentence'):
        sent_id = int(sent_e.get('id'))
        for tok_e in sent_e.iter('token'):
            tok_id = int(tok_e.get('id'))
            tok_word = str(tok_e.find('word').text)
            tok_lemma = str(tok_e.find('lemma').text)
            tok_start = int(tok_e.find('CharacterOffsetBegin').text)
            tok_end = int(tok_e.find('CharacterOffsetEnd').text)
            tok_pos = str(tok_e.find('POS').text)
            tok_ner = str(tok_e.find('NER').text)

            token_by_ids[sent_id][tok_id] = Token(
                word=tok_word,
                lemma=tok_lemma,
                start=tok_start,
                end=tok_end,
                # Escape the PoS since brat dislikes $ and .
                pos=_escape_pos_tags(tok_pos),
                ner=tok_ner
            )

    return token_by_ids
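

# Yield (sentence_id, token_id, Token) triples in sentence and token order.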
def _tok_it(token_by_ids):
    for s_id in sorted(k for k in token_by_ids):
        for t_id in sorted(k for k in token_by_ids[s_id]):
            yield s_id, t_id, token_by_ids[s_id][t_id]


def _soup(xml):
    return ElementTree.fromstring(xml)


def token_offsets(xml):
    soup = _soup(xml)
    token_by_ids = _token_by_ids(soup)
    return [(tok.start, tok.end) for _, _, tok in _tok_it(token_by_ids)]


def sentence_offsets(xml):
    soup = _soup(xml)
    token_by_ids = _token_by_ids(soup)
    sent_min_max = defaultdict(lambda: (2**32, -1, ))
    for s_id, _, tok in _tok_it(token_by_ids):
        s_entry = sent_min_max[s_id]
        sent_min_max[s_id] = (min(tok.start, s_entry[0]),
                              max(tok.end, s_entry[1]), )
    return sorted((s_start, s_end)
                  for s_start, s_end in sent_min_max.values())


def text(xml):
    # It would be nice to have access to the original text, but this actually
    #   isn't a part of the XML. Constructing it isn't that easy either, you
    #   would have to assume that each "missing" character is a space, but you
    #   don't really have any guarantee that this is the case...
    soup = _soup(xml)
    token_by_ids = _token_by_ids(soup)

    # Get the presumed length of the text
    max_offset = -1
    for _, _, tok in _tok_it(token_by_ids):
        max_offset = max(max_offset, tok.end)

    # Then re-construct what we believe the text to be
    text = list(' ' * max_offset)
    for _, _, tok in _tok_it(token_by_ids):
        # Also unescape any PTB escapes in the text while we are at it
        # Note: Since Stanford actually doesn't do all the escapings properly
        #   this will sometimes fail! Hint: Try "*/\*".
        unesc_word = ptb_unescape(tok.word)
        text[tok.start:tok.start + len(unesc_word)] = unesc_word

    return ''.join(text)
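

# Generate one brat text-bound annotation per token, typed by its (escaped)
# PoS tag and assigned sequential ids 'T<start_id>', 'T<start_id + 1>', ...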
def _pos(xml, start_id=1):
    soup = _soup(xml)
    token_by_ids = _token_by_ids(soup)

    curr_id = start_id
    for s_id, t_id, tok in _tok_it(token_by_ids):
        yield s_id, t_id, TextBoundAnnotation(((tok.start, tok.end, ), ),
                                              'T%s' % curr_id, tok.pos, '')
        curr_id += 1


def pos(xml, start_id=1):
    return (a for _, _, a in _pos(xml, start_id=start_id))


def ner(xml, start_id=1):
    soup = _soup(xml)
    token_by_ids = _token_by_ids(soup)

    # Stanford only has Inside and Outside tags, so conversion is easy
    nes = []
    last_ne_tok = None
    prev_tok = None
    for _, _, tok in _tok_it(token_by_ids):
        if tok.ner != 'O':
            if last_ne_tok is None:
                # Start of an NE from nothing
                last_ne_tok = tok
            elif tok.ner != last_ne_tok.ner:
                # Change in NE type
                nes.append(
                    (last_ne_tok.start, prev_tok.end, last_ne_tok.ner, ))
                last_ne_tok = tok
            else:
                # Continuation of the last NE, move along
                pass
        elif last_ne_tok is not None:
            # NE ended
            nes.append((last_ne_tok.start, prev_tok.end, last_ne_tok.ner, ))
            last_ne_tok = None
        prev_tok = tok
    else:
        # Do we need to terminate the last named entity?
        if last_ne_tok is not None:
            nes.append((last_ne_tok.start, prev_tok.end, last_ne_tok.ner, ))

    curr_id = start_id
    for start, end, _type in nes:
        yield TextBoundAnnotation(((start, end), ), 'T%s' % curr_id, _type, '')
        curr_id += 1
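

# Convert coreference chains into one 'Mention' text-bound annotation per
# mention, followed by a 'Coreference' equiv annotation tying each chain
# together.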
def coref(xml, start_id=1):
    soup = _soup(xml)
    token_by_ids = _token_by_ids(soup)

    docs_e = soup.findall('document')
    assert len(docs_e) == 1
    docs_e = docs_e[0]
    # Despite its singular name, this element contains all coreference chains
    corefs_e = docs_e.findall('coreference')
    if not corefs_e:
        # No coreferences to process
        return
    assert len(corefs_e) == 1
    corefs_e = corefs_e[0]

    curr_id = start_id
    for coref_e in corefs_e:
        if coref_e.tag != 'coreference':
            # To be on the safe side
            continue

        # This tag is now a full coreference chain
        chain = []
        for mention_e in coref_e.iter('mention'):
            # Note: There is a "representative" attribute signalling the most
            #   "suitable" mention, we are currently not using this
            # Note: We don't use the head information for each mention
            sentence_id = int(mention_e.find('sentence').text)
            start_tok_id = int(mention_e.find('start').text)
            end_tok_id = int(mention_e.find('end').text) - 1

            mention_id = 'T%s' % (curr_id, )
            chain.append(mention_id)
            curr_id += 1
            yield TextBoundAnnotation(
                ((token_by_ids[sentence_id][start_tok_id].start,
                  token_by_ids[sentence_id][end_tok_id].end), ),
                mention_id, 'Mention', '')

        yield EquivAnnotation('Coreference', chain, '')


def _find_sentences_element(soup):
    # Find the right portion of the XML and do some limited sanity checking
    docs_e = soup.findall('document')
    assert len(docs_e) == 1
    docs_e = docs_e[0]

    sents_e = docs_e.findall('sentences')
    assert len(sents_e) == 1
    sents_e = sents_e[0]

    return sents_e
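

# Shared worker for the dependency converters below: first yields the PoS
# annotations for all tokens, then one Governor/Dependent binary relation
# per dependency edge (edges to the root node are skipped).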
def _dep(xml, source_element='basic-dependencies'):
    soup = _soup(xml)
    _token_by_ids(soup)

    ann_by_ids = defaultdict(dict)
    for s_id, t_id, ann in _pos(xml):
        ann_by_ids[s_id][t_id] = ann
        yield ann

    curr_rel_id = 1
    for sent_e in _find_sentences_element(soup).iter('sentence'):
        sent_id = int(sent_e.get('id'))

        # Attempt to find dependencies as distinctly named elements as they
        #   were stored in the Stanford XML format prior to 2013.
        deps_e = sent_e.findall(source_element)
        if len(deps_e) == 0:
            # Perhaps we are processing output following the newer standard,
            #   check for the same identifier but as a type attribute for
            #   general "dependencies" elements.
            deps_e = list(e for e in sent_e.iter('dependencies')
                          if e.attrib['type'] == source_element)
        assert len(deps_e) == 1
        deps_e = deps_e[0]

        for dep_e in deps_e:
            if dep_e.tag != 'dep':
                # To be on the safe side
                continue

            dep_type = dep_e.get('type')
            assert dep_type is not None

            if dep_type == 'root':
                # Skip dependencies to the root node, this behaviour conforms
                #   with how we treated the pre-2013 format.
                continue

            gov_tok_id = int(dep_e.find('governor').get('idx'))
            dep_tok_id = int(dep_e.find('dependent').get('idx'))

            yield BinaryRelationAnnotation(
                'R%s' % curr_rel_id, dep_type,
                'Governor', ann_by_ids[sent_id][gov_tok_id].id,
                'Dependent', ann_by_ids[sent_id][dep_tok_id].id,
                ''
            )
            curr_rel_id += 1


def basic_dep(xml):
    return _dep(xml)


def collapsed_dep(xml):
    return _dep(xml, source_element='collapsed-dependencies')


def collapsed_ccproc_dep(xml):
    return _dep(xml, source_element='collapsed-ccprocessed-dependencies')
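

# Rudimentary test: with no arguments, convert the bundled sample XML below;
# otherwise treat each argument as the path to a Stanford CoreNLP XML file,
# e.g. `python stanford.py output.xml` (file name purely illustrative).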
if __name__ == '__main__':
    STANFORD_XML = '''<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="CoreNLP-to-HTML.xsl" type="text/xsl"?>
<root>
  <document>
    <sentences>
      <sentence id="1">
        <tokens>
          <token id="1">
            <word>Stanford</word>
            <lemma>Stanford</lemma>
            <CharacterOffsetBegin>0</CharacterOffsetBegin>
            <CharacterOffsetEnd>8</CharacterOffsetEnd>
            <POS>NNP</POS>
            <NER>ORGANIZATION</NER>
          </token>
          <token id="2">
            <word>University</word>
            <lemma>University</lemma>
            <CharacterOffsetBegin>9</CharacterOffsetBegin>
            <CharacterOffsetEnd>19</CharacterOffsetEnd>
            <POS>NNP</POS>
            <NER>ORGANIZATION</NER>
          </token>
          <token id="3">
            <word>is</word>
            <lemma>be</lemma>
            <CharacterOffsetBegin>20</CharacterOffsetBegin>
            <CharacterOffsetEnd>22</CharacterOffsetEnd>
            <POS>VBZ</POS>
            <NER>O</NER>
          </token>
          <token id="4">
            <word>located</word>
            <lemma>located</lemma>
            <CharacterOffsetBegin>23</CharacterOffsetBegin>
            <CharacterOffsetEnd>30</CharacterOffsetEnd>
            <POS>JJ</POS>
            <NER>O</NER>
          </token>
          <token id="5">
            <word>in</word>
            <lemma>in</lemma>
            <CharacterOffsetBegin>31</CharacterOffsetBegin>
            <CharacterOffsetEnd>33</CharacterOffsetEnd>
            <POS>IN</POS>
            <NER>O</NER>
          </token>
          <token id="6">
            <word>California</word>
            <lemma>California</lemma>
            <CharacterOffsetBegin>34</CharacterOffsetBegin>
            <CharacterOffsetEnd>44</CharacterOffsetEnd>
            <POS>NNP</POS>
            <NER>LOCATION</NER>
          </token>
          <token id="7">
            <word>.</word>
            <lemma>.</lemma>
            <CharacterOffsetBegin>44</CharacterOffsetBegin>
            <CharacterOffsetEnd>45</CharacterOffsetEnd>
            <POS>.</POS>
            <NER>O</NER>
          </token>
        </tokens>
        <parse>(ROOT (S (NP (NNP Stanford) (NNP University)) (VP (VBZ is) (ADJP (JJ located) (PP (IN in) (NP (NNP California))))) (. .))) </parse>
        <basic-dependencies>
          <dep type="nn">
            <governor idx="2">University</governor>
            <dependent idx="1">Stanford</dependent>
          </dep>
          <dep type="nsubj">
            <governor idx="4">located</governor>
            <dependent idx="2">University</dependent>
          </dep>
          <dep type="cop">
            <governor idx="4">located</governor>
            <dependent idx="3">is</dependent>
          </dep>
          <dep type="prep">
            <governor idx="4">located</governor>
            <dependent idx="5">in</dependent>
          </dep>
          <dep type="pobj">
            <governor idx="5">in</governor>
            <dependent idx="6">California</dependent>
          </dep>
        </basic-dependencies>
        <collapsed-dependencies>
          <dep type="nn">
            <governor idx="2">University</governor>
            <dependent idx="1">Stanford</dependent>
          </dep>
          <dep type="nsubj">
            <governor idx="4">located</governor>
            <dependent idx="2">University</dependent>
          </dep>
          <dep type="cop">
            <governor idx="4">located</governor>
            <dependent idx="3">is</dependent>
          </dep>
          <dep type="prep_in">
            <governor idx="4">located</governor>
            <dependent idx="6">California</dependent>
          </dep>
        </collapsed-dependencies>
        <collapsed-ccprocessed-dependencies>
          <dep type="nn">
            <governor idx="2">University</governor>
            <dependent idx="1">Stanford</dependent>
          </dep>
          <dep type="nsubj">
            <governor idx="4">located</governor>
            <dependent idx="2">University</dependent>
          </dep>
          <dep type="cop">
            <governor idx="4">located</governor>
            <dependent idx="3">is</dependent>
          </dep>
          <dep type="prep_in">
            <governor idx="4">located</governor>
            <dependent idx="6">California</dependent>
          </dep>
        </collapsed-ccprocessed-dependencies>
      </sentence>
      <sentence id="2">
        <tokens>
          <token id="1">
            <word>It</word>
            <lemma>it</lemma>
            <CharacterOffsetBegin>46</CharacterOffsetBegin>
            <CharacterOffsetEnd>48</CharacterOffsetEnd>
            <POS>PRP</POS>
            <NER>O</NER>
          </token>
          <token id="2">
            <word>is</word>
            <lemma>be</lemma>
            <CharacterOffsetBegin>49</CharacterOffsetBegin>
            <CharacterOffsetEnd>51</CharacterOffsetEnd>
            <POS>VBZ</POS>
            <NER>O</NER>
          </token>
          <token id="3">
            <word>a</word>
            <lemma>a</lemma>
            <CharacterOffsetBegin>52</CharacterOffsetBegin>
            <CharacterOffsetEnd>53</CharacterOffsetEnd>
            <POS>DT</POS>
            <NER>O</NER>
          </token>
          <token id="4">
            <word>great</word>
            <lemma>great</lemma>
            <CharacterOffsetBegin>54</CharacterOffsetBegin>
            <CharacterOffsetEnd>59</CharacterOffsetEnd>
            <POS>JJ</POS>
            <NER>O</NER>
          </token>
          <token id="5">
            <word>university</word>
            <lemma>university</lemma>
            <CharacterOffsetBegin>60</CharacterOffsetBegin>
            <CharacterOffsetEnd>70</CharacterOffsetEnd>
            <POS>NN</POS>
            <NER>O</NER>
          </token>
          <token id="6">
            <word>.</word>
            <lemma>.</lemma>
            <CharacterOffsetBegin>70</CharacterOffsetBegin>
            <CharacterOffsetEnd>71</CharacterOffsetEnd>
            <POS>.</POS>
            <NER>O</NER>
          </token>
        </tokens>
        <parse>(ROOT (S (NP (PRP It)) (VP (VBZ is) (NP (DT a) (JJ great) (NN university))) (. .))) </parse>
        <basic-dependencies>
          <dep type="nsubj">
            <governor idx="5">university</governor>
            <dependent idx="1">It</dependent>
          </dep>
          <dep type="cop">
            <governor idx="5">university</governor>
            <dependent idx="2">is</dependent>
          </dep>
          <dep type="det">
            <governor idx="5">university</governor>
            <dependent idx="3">a</dependent>
          </dep>
          <dep type="amod">
            <governor idx="5">university</governor>
            <dependent idx="4">great</dependent>
          </dep>
        </basic-dependencies>
        <collapsed-dependencies>
          <dep type="nsubj">
            <governor idx="5">university</governor>
            <dependent idx="1">It</dependent>
          </dep>
          <dep type="cop">
            <governor idx="5">university</governor>
            <dependent idx="2">is</dependent>
          </dep>
          <dep type="det">
            <governor idx="5">university</governor>
            <dependent idx="3">a</dependent>
          </dep>
          <dep type="amod">
            <governor idx="5">university</governor>
            <dependent idx="4">great</dependent>
          </dep>
        </collapsed-dependencies>
        <collapsed-ccprocessed-dependencies>
          <dep type="nsubj">
            <governor idx="5">university</governor>
            <dependent idx="1">It</dependent>
          </dep>
          <dep type="cop">
            <governor idx="5">university</governor>
            <dependent idx="2">is</dependent>
          </dep>
          <dep type="det">
            <governor idx="5">university</governor>
            <dependent idx="3">a</dependent>
          </dep>
          <dep type="amod">
            <governor idx="5">university</governor>
            <dependent idx="4">great</dependent>
          </dep>
        </collapsed-ccprocessed-dependencies>
      </sentence>
    </sentences>
    <coreference>
      <coreference>
        <mention representative="true">
          <sentence>1</sentence>
          <start>1</start>
          <end>3</end>
          <head>2</head>
        </mention>
        <mention>
          <sentence>2</sentence>
          <start>1</start>
          <end>2</end>
          <head>1</head>
        </mention>
        <mention>
          <sentence>2</sentence>
          <start>3</start>
          <end>6</end>
          <head>5</head>
        </mention>
      </coreference>
    </coreference>
  </document>
</root>
'''

    def _test_xml(xml_string):
        stdout.write('Text:\n')
        stdout.write(text(xml_string))
        stdout.write('\n')
        stdout.write('\n')

        stdout.write('Part-of-speech:\n')
        for ann in pos(xml_string):
            stdout.write(str(ann))
            stdout.write('\n')
        stdout.write('\n')

        stdout.write('Named Entity Recognition:\n')
        for ann in ner(xml_string):
            stdout.write(str(ann))
            stdout.write('\n')
        stdout.write('\n')

        stdout.write('Co-reference:\n')
        for ann in coref(xml_string):
            stdout.write(str(ann))
            stdout.write('\n')
        stdout.write('\n')

        stdout.write('Basic dependencies:\n')
        for ann in basic_dep(xml_string):
            stdout.write(str(ann))
            stdout.write('\n')
        stdout.write('\n')

        stdout.write('Collapsed dependencies:\n')
        for ann in collapsed_dep(xml_string):
            stdout.write(str(ann))
            stdout.write('\n')
        stdout.write('\n')

        stdout.write('Collapsed CC-processed dependencies:\n')
        for ann in collapsed_ccproc_dep(xml_string):
            stdout.write(str(ann))
            stdout.write('\n')
        stdout.write('\n')

        stdout.write('Token boundaries:\n')
        stdout.write(str(token_offsets(xml_string)))
        stdout.write('\n')
        stdout.write('\n')

        stdout.write('Sentence boundaries:\n')
        stdout.write(str(sentence_offsets(xml_string)))
        stdout.write('\n')

    if len(argv) < 2:
        xml_strings = (('<string>', STANFORD_XML), )
    else:
        def _xml_gen():
            for xml_path in argv[1:]:
                # We assume UTF-8 here, otherwise ElementTree will bork
                with open(xml_path, 'r', encoding='utf-8') as xml_file:
                    yield (xml_path, xml_file.read())
        xml_strings = _xml_gen()

    for xml_source, xml_string in xml_strings:
        try:
            print(xml_source, file=stderr)
            _test_xml(xml_string)
        except BaseException:
            print('Crashed on:', xml_source, file=stderr)
            raise