rogel il y a 5 ans
commit
ec7973e324
100 fichiers modifiés avec 5644 ajouts et 0 suppressions
  1. 3 0
      .gitignore
  2. 14 0
      AUTHORS
  3. 44 0
      ChangeLog
  4. 27 0
      LICENSE
  5. 30 0
      LICENSE_details.txt
  6. 15 0
      MANIFEST.in
  7. 88 0
      README.rst
  8. 1 0
      docs/Changelog
  9. 153 0
      docs/Makefile
  10. 151 0
      docs/active_learning_tutorial.rst
  11. 244 0
      docs/conf.py
  12. 105 0
      docs/corpus_labeling.rst
  13. BIN
      docs/create_eo.png
  14. 45 0
      docs/gazettes.rst
  15. BIN
      docs/home_screenshot.png
  16. 192 0
      docs/how_to_hack.rst
  17. 0 0
      docs/iepy_1.svg
  18. 0 0
      docs/iepy_2.svg
  19. 0 0
      docs/iepy_3.svg
  20. 0 0
      docs/iepy_4.svg
  21. 85 0
      docs/index.rst
  22. 79 0
      docs/installation.rst
  23. 137 0
      docs/instantiation.rst
  24. BIN
      docs/label_by_document_entity_edition.png
  25. BIN
      docs/label_by_document_relation_labeled.png
  26. BIN
      docs/label_by_document_screenshot.png
  27. BIN
      docs/label_by_segment_screenshot.png
  28. BIN
      docs/labels_by_iepy.png
  29. 60 0
      docs/language.rst
  30. 231 0
      docs/preprocess.rst
  31. 188 0
      docs/rules_tutorial.rst
  32. 2 0
      docs/setup/requirements-base.txt
  33. 3 0
      docs/setup/requirements-development.txt
  34. 0 0
      docs/setup/system_packages.txt
  35. 4 0
      docs/setup/third_party.txt
  36. 38 0
      docs/troubleshooting.rst
  37. 110 0
      docs/tutorial.rst
  38. 19 0
      docs/virtualenv.rst
  39. 7 0
      examples/birthdate/scripts/create_birthdate_relation.py
  40. 47 0
      examples/birthdate/scripts/csv_to_iepy.py
  41. 34 0
      examples/birthdate/scripts/preprocess.py
  42. 42 0
      examples/birthdate/settings.py
  43. 122 0
      examples/birthdate/was_born_rules_sample.py
  44. 1 0
      examples/credit/__init__.py
  45. 49 0
      examples/credit/annotation.conf
  46. 7 0
      examples/credit/articles.csv
  47. 28 0
      examples/credit/bin/csv_to_iepy.py
  48. 76 0
      examples/credit/bin/gazettes_loader.py
  49. 59 0
      examples/credit/bin/iepy_rules_runner.py
  50. 184 0
      examples/credit/bin/iepy_runner.py
  51. 12 0
      examples/credit/bin/manage.py
  52. 96 0
      examples/credit/bin/preprocess.py
  53. 149 0
      examples/credit/bin/rules_verifier.py
  54. 241 0
      examples/credit/bin/settlement.py
  55. 20 0
      examples/credit/extractor_config.json
  56. 6 0
      examples/credit/format.py
  57. 2 0
      examples/credit/rules.py
  58. 182 0
      examples/credit/settings.py
  59. BIN
      examples/credit/test.sqlite
  60. 1 0
      examples/product/__init__.py
  61. 49 0
      examples/product/annotation.conf
  62. 7 0
      examples/product/articles.csv
  63. 28 0
      examples/product/bin/csv_to_iepy.py
  64. 76 0
      examples/product/bin/gazettes_loader.py
  65. 59 0
      examples/product/bin/iepy_rules_runner.py
  66. 184 0
      examples/product/bin/iepy_runner.py
  67. 12 0
      examples/product/bin/manage.py
  68. 96 0
      examples/product/bin/preprocess.py
  69. 149 0
      examples/product/bin/rules_verifier.py
  70. 241 0
      examples/product/bin/settlement.py
  71. 20 0
      examples/product/extractor_config.json
  72. 6 0
      examples/product/format.py
  73. 188 0
      examples/product/product_article.csv
  74. 2 0
      examples/product/rules.py
  75. 182 0
      examples/product/settings.py
  76. BIN
      examples/product/test.sqlite
  77. 1 0
      examples/test/__init__.py
  78. 49 0
      examples/test/annotation.conf
  79. 7 0
      examples/test/articles.csv
  80. BIN
      examples/test/bin/2020-08-01-2020-08-31要素标注统计.xls
  81. BIN
      examples/test/bin/None-2020-09-25要素标注统计.xls
  82. BIN
      examples/test/bin/None-2020-10-31要素标注统计.xls
  83. BIN
      examples/test/bin/None-2020-11-25要素标注统计.xls
  84. BIN
      examples/test/bin/None-2020-12-25要素标注统计.xls
  85. 28 0
      examples/test/bin/csv_to_iepy.py
  86. 76 0
      examples/test/bin/gazettes_loader.py
  87. 59 0
      examples/test/bin/iepy_rules_runner.py
  88. 184 0
      examples/test/bin/iepy_runner.py
  89. 12 0
      examples/test/bin/manage.py
  90. 96 0
      examples/test/bin/preprocess.py
  91. 149 0
      examples/test/bin/rules_verifier.py
  92. 251 0
      examples/test/bin/settlement.py
  93. BIN
      examples/test/bin/分组_1.xls
  94. BIN
      examples/test/bin/分组_10.xls
  95. BIN
      examples/test/bin/分组_2.xls
  96. BIN
      examples/test/bin/分组_3.xls
  97. BIN
      examples/test/bin/分组_4.xls
  98. BIN
      examples/test/bin/分组_5.xls
  99. BIN
      examples/test/bin/分组_6.xls
  100. BIN
      examples/test/bin/分组_7.xls

+ 3 - 0
.gitignore

@@ -0,0 +1,3 @@
+*.pyc
+iepy.egg-info
+/.idea/

+ 14 - 0
AUTHORS

@@ -0,0 +1,14 @@
+Authors
+
+- Rafael Carrascosa <rcarrascosa@machinalis.com> (rafacarrascosa at github)
+- Javier Mansilla <jmansilla@machinalis.com> (jmansilla at github)
+- Gonzalo García Berrotarán <ggarcia@machinalis.com> (j0hn at github)
+- Daniel Moisset <dmoisset@machinalis.com> (dmoisset at github)
+- Franco M. Luque <francolq@famaf.unc.edu.ar> (francolq at github)
+
+Contributors
+
+- Marcos Spontón (msponton@machinalis.com)
+- Laura Alonso i Alemany (lalonsoialemany@machinalis.com)
+- Patricio Del Boca (pdelboca@machinalis.com)
+- Elías Andrawos (eandrawos@machinalis.com)

+ 44 - 0
ChangeLog

@@ -0,0 +1,44 @@
+0.9.6
+    - Fixed some dependencies declarations to provide support for python 3.5
+    - Bug fix respect to active learning predictions
+    - Added support for German preprocess (thanks @sweh)
+
+0.9.5
+    - Bug fix on TokenizerSentencerRunner (thanks ezesalta)
+    - Fix on installation dependencies
+    - Tokenization options can be handled from instance settings file
+
+0.9.4
+    - Added multicore preprocess
+    - Added support for Stanford 3.5.2 preprocess models
+
+0.9.3
+    - Added grammatical parsing to the preprocess flow of documents
+    - Added support for Spanish preprocess
+    - Restricted each iepy-instance to a single language
+    - Gazetter support
+    - Labeling UI improvements
+    - Performance and memory usage improvements
+    - Model simplifications (labels, metadata)
+    - Storage & view of predictions
+
+0.9.2
+    - Add ability to use custom features (http://iepy.rtfd.org/en/latest/how_to_hack.html#implementing-your-own-features)
+    - Add ability to use rules as features (http://iepy.rtfd.org/en/latest/how_to_hack.html#using-rules-as-features)
+    - Add rules verifier (http://iepy.rtfd.org/en/latest/rules_tutorial.html#verifying-your-rules)
+    - Fixed bugs of compatibility with firefox [thanks dchaplinsky for the bug report]
+    - Skip instead of crashing when a document could not be loaded via csv importer [thanks dchaplinsky for the report and suggestion]
+    - Performance improvement on rules runner
+    - Change instance files schema, now it's a python package and renamed settings.
+    - Add lemmatization to the pre-process (http://iepy.rtfd.org/en/latest/preprocess.html#lemmatization)
+    - Fix critical bug on loading rules
+    - Fix critical bug on ranking questions on the active learning extraction runner
+
+0.9.1
+    - Add entity kind on the modal dialog
+    - Change arrows display to be more understandable
+    - Join skip and don't know label options
+    - Change options dropdown for radio buttons
+    - Show help for shortcuts and change the order of the options
+    - Documents rich view (without needing to be labeling the document for some relation)
+    - instance upgrader

+ 27 - 0
LICENSE

@@ -0,0 +1,27 @@
+Copyright (c) Machinalis and individual contributors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+    1. Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+
+    2. Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+    3. Neither the name of Machinalis nor the names of its contributors may be
+       used to endorse or promote products derived from this software without
+       specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ 30 - 0
LICENSE_details.txt

@@ -0,0 +1,30 @@
+The python libraries (and their licenses) that we are explicitly depending on,
+are the following ones,
+
+    - nltk (Apache License)
+    - numpy (BSD)
+    - scipy (BSD)
+    - scikit-learn (BSD)
+    - mock (BSD)
+    - docopt (MIT)
+    - future (MIT)
+    - appdirs (MIT)
+    - wget (Public Domain)
+    - colorama (BSD)
+    - featureforge (BSD)
+
+The development tools we are using:
+
+    - nose (LGPL)
+    - factory-boy (MIT)
+
+Additionally, in order to be able to create your own iepy-ready corpus with our
+preprocessing tools, you'll need to download the following things that are not
+provided by this software
+
+    - punkt tokenizer (acquirable with the NLTK downloader or the
+                       download_third_party_data script)
+    - wordnet (acquirable with the NLTK downloader or the
+               download_third_party_data script)
+    - GPL Stanford CoreNLP (acquirable with download_third_party_data script)
+    - GPL Stanford Spanish Models (acquirable with download_third_party_data script)

+ 15 - 0
MANIFEST.in

@@ -0,0 +1,15 @@
+include README.rst
+include AUTHORS
+include LICENSE
+include MANIFEST.in
+include ChangeLog
+include iepy/version.txt
+
+recursive-include iepy/instantiation *.template
+recursive-include iepy/preprocess/utils *.jar
+recursive-include iepy/webui/corpus/static *
+recursive-include iepy/webui/corpus/templates *
+recursive-include docs/setup requirements*.txt
+
+recursive-exclude * __pycache__
+recursive-exclude * *.py[co]

+ 88 - 0
README.rst

@@ -0,0 +1,88 @@
+IEPY
+====
+
+IEPY is an open source tool for
+`Information Extraction <http://en.wikipedia.org/wiki/Information_extraction>`_
+focused on Relation Extraction.
+
+To give an example of Relation Extraction, if we are trying to find a
+birth date in:
+
+    `"John von Neumann (December 28, 1903 – February 8, 1957) was a Hungarian and
+    American pure and applied mathematician, physicist, inventor and polymath."`
+
+then IEPY's task is to identify "``John von Neumann``" and
+"``December 28, 1903``" as the subject and object entities of the "``was born in``"
+relation.
+
+It's aimed at:
+    - `users <http://iepy.readthedocs.org/en/latest/active_learning_tutorial.html>`_
+      needing to perform Information Extraction on a large dataset.
+    - `scientists <http://iepy.readthedocs.org/en/latest/how_to_hack.html>`_
+      wanting to experiment with new IE algorithms.
+
+Features
+--------
+
+    - `A corpus annotation tool <http://iepy.readthedocs.org/en/latest/corpus_labeling.html>`_
+      with a `web-based UI <http://iepy.readthedocs.org/en/latest/corpus_labeling.html#document-based-labeling>`_
+    - `An active learning relation extraction tool <http://iepy.readthedocs.org/en/latest/active_learning_tutorial.html>`_
+      pre-configured with convenient defaults.
+    - `A rule based relation extraction tool <http://iepy.readthedocs.org/en/latest/rules_tutorial.html>`_
+      for cases where the documents are semi-structured or high precision is required.
+    - A web-based user interface that:
+        - Allows layman users to control some aspects of IEPY.
+        - Allows decentralization of human input.
+    - A shallow entity ontology with coreference resolution via `Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml>`_
+    - `An easily hack-able active learning core <http://iepy.readthedocs.org/en/latest/how_to_hack.html>`_,
+      ideal for scientist wanting to experiment with new algorithms.
+
+Installation
+------------
+
+Install the required packages:
+
+.. code-block:: bash
+
+    sudo apt-get install build-essential python3-dev liblapack-dev libatlas-dev gfortran openjdk-7-jre
+
+Then simply install with **pip**:
+
+.. code-block:: bash
+
+    pip install iepy
+
+Full details about the installation is available on the
+`Read the Docs <http://iepy.readthedocs.org/en/latest/installation.html>`__ page.
+
+Running the tests
+-----------------
+
+If you are contributing to the project and want to run the tests, all you have to do is:
+
+    - Make sure your JAVAHOME is correctly set. `Read more about it here <http://iepy.readthedocs.io/en/latest/installation.html#install-iepy-package>`_
+    - In the root of the project run `nosetests`
+
+Learn more
+----------
+
+The full documentation is available on `Read the Docs <http://iepy.readthedocs.org/en/latest/>`__.
+
+
+Authors
+-------
+
+IEPY is © 2014 `Machinalis <http://www.machinalis.com/>`_ in collaboration
+with the `NLP Group at UNC-FaMAF <http://pln.famaf.unc.edu.ar/>`_. Its primary
+authors are:
+
+ * Rafael Carrascosa <rcarrascosa@machinalis.com> (rafacarrascosa at github)
+ * Javier Mansilla <jmansilla@machinalis.com> (jmansilla at github)
+ * Gonzalo García Berrotarán <ggarcia@machinalis.com> (j0hn at github)
+ * Franco M. Luque <francolq@famaf.unc.edu.ar> (francolq at github)
+ * Daniel Moisset <dmoisset@machinalis.com> (dmoisset at github)
+
+You can follow the development of this project and report issues at
+http://github.com/machinalis/iepy
+
+You can join the mailing list `here <https://groups.google.com/forum/?hl=es-419#%21forum/iepy>`__

+ 1 - 0
docs/Changelog

@@ -0,0 +1 @@
+../ChangeLog

+ 153 - 0
docs/Makefile

@@ -0,0 +1,153 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	-rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/IEPY.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/IEPY.qhc"
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/IEPY"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/IEPY"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	make -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."

+ 151 - 0
docs/active_learning_tutorial.rst

@@ -0,0 +1,151 @@
+Running the active learning core
+================================
+
+The active learning core works by trying to predict the relations using information provided by the user.
+This means you'll have to label some of the examples and based on those, the core will infer the rest.
+The core will also give you to label the more important examples (those which best helps
+to figure out the other cases).
+
+To start using it you'll need to define a relation, run the core, label some evidence and re-run the core loop.
+You can also label evidences and re-run the core as much as you like to have a better performance.
+
+Creating a relation
+-------------------
+
+To create a relation, first `open up the web server <tutorial.html#open-the-web-interface>`__ if you haven't already, and use a
+web browser to navigate on `http://127.0.0.1:8000 <http://127.0.0.1:8000>`_.
+There you'll find instructions on how to create a relation.
+
+Running the core
+----------------
+
+After creating a relation, you can start the core to look for instances of that relation.
+
+You can run this core in two modes: **High precision** or **high recall**.
+`Precision and recall <http://en.wikipedia.org/wiki/Precision_and_recall>`_ can be traded with one another up to a certain point.  I.e. it is possible to trade some
+recall to get better precision and vice versa.
+
+To visualize better this trade off, lets see an example:
+A precision of 99% means that 1 of every 100 predicted relations will be wrong and the rest will be correct.
+A recall of 30% means that only 30 out of 100 existent relations will be detected by the algorithm and the rest
+will be wrongly discarded as "no relation present".
+
+Run the active learning core by doing:
+
+.. code-block:: bash
+
+    python bin/iepy_runner.py <relation_name> <output>
+
+And add ``--tune-for=high-prec`` or ``--tune-for=high-recall`` before the relation name to switch
+between modes. The default is **high precision**.
+
+This will run until it needs you to label some of the evidences. At this point, what you
+need to do is go to the web interface that you ran on the previous step, and there you
+can label some evidences.
+
+When you consider that is enough, on the prompt that the iepy runner presented you,
+continue the execution by typing **run**.
+
+That will cycle again and repeat the process.
+
+Run the active learning core in the command line and ask it to **STOP**.
+It'll save a csv with the automatic classifications for all evidences in the database.
+
+Also, note that you can only predict a relation for a text that has been inserted into the database.
+The csv output file has the primary key of an object in the database that represents the evidence that 
+was classified as "relation present" or "relation not present". An evidence object in the database is a
+rich-in-information object containing the entities and circumstances surrounding the prediction that 
+is too complex to put in a single csv file.
+
+In order to access the entities and other details you'll need to write a script 
+to talk with the database (see iepy/data/models.py).
+
+
+Fine tuning
+-----------
+
+If you want to modify the internal behavior, you can change the settings file. On your instance
+folder you'll fine a file called ``extractor_config.json``. There you've all the configuration
+for the internal classifier, such as:
+
+Classifier
+..........
+
+This sets the classifier algorithm to be used, you can choose from:
+
+    * sgd: `Stochastic Gradient Descent <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html>`_
+    * knn: `Nearest Neighbors <http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier>`_
+    * svc `(default)`: `C-Support Vector Classification <http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html>`_
+    * randomforest: `Random Forest <http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html>`_
+    * adaboost: `AdaBoost <http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html>`_
+
+Features
+........
+
+Features to be used in the classifier, you can use a subset of:
+
+    * number_of_tokens
+    * symbols_in_between
+    * in_same_sentence
+    * verbs_count
+    * verbs_count_in_between
+    * total_number_of_entities
+    * other_entities_in_between
+    * entity_distance
+    * entity_order
+    * bag_of_wordpos_bigrams_in_between
+    * bag_of_wordpos_in_between
+    * bag_of_word_bigrams_in_between
+    * bag_of_pos_in_between
+    * bag_of_words_in_between
+    * bag_of_wordpos_bigrams
+    * bag_of_wordpos
+    * bag_of_word_bigrams
+    * bag_of_pos
+    * bag_of_words
+
+These can be added as `sparse` adding them into the
+`sparse_features` section or added as `dense` into the `dense_features`.
+
+The features in the sparse section will go through a stage of linear dimension reduction
+and the dense features, by default, will be used with a non-linear classifier.
+
+
+Viewing predictions on the web user interface
+---------------------------------------------
+
+If you prefer to review the predictions using the web interface is possible to run the
+active learning core in a way that stores the results on the database and they are accesible
+through the web.
+
+To do so, you'll have to run the core like this:
+
+.. code-block:: bash
+
+    python bin/iepy_runner.py --db-store <relation_name> 
+
+We do not have an specialized interface to review predictions but you can still view them
+by using the :doc:`interface to create a reference corpus <corpus_labeling>`.
+
+This way, you'll get labels as a new **judge** called iepy-run and a date.
+
+.. image:: labels_by_iepy.png
+
+
+Saving predictor for later use
+------------------------------
+
+Since training could be a slow process, you might want to save your trained predictor and
+re-use it several times without the need to train again.
+
+You can save it this by doing:
+
+.. code-block:: bash
+
+    python bin/iepy_runner.py --store-extractor=myextractor.pickle <relation_name> <output>
+
+And re use it like this:
+
+.. code-block:: bash
+
+    python bin/iepy_runner.py --trained-extractor=myextractor.pickle <relation_name> <output>

+ 244 - 0
docs/conf.py

@@ -0,0 +1,244 @@
+# -*- coding: utf-8 -*-
+#
+# IEPY documentation build configuration file, created by
+# sphinx-quickstart on Wed Apr 23 20:02:15 2014.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+AUTHORS = (u'Rafael Carrascosa, Javier Mansilla, Gonzalo García Berrotarán, '
+           'Daniel Moisset, Franco M. Luque')
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.viewcode']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'IEPY'
+copyright = u'2014, ' + AUTHORS
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+from os import path as _p
+with open(_p.join(_p.dirname(_p.abspath(__file__)), '..', 'iepy', 'version.txt')) as vfile:
+    version = vfile.readline().strip()
+# The full version, including alpha/beta/rc tags.
+release = version
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = "sphinx_rtd_theme"
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+html_theme_path = [os.getenv('VIRTUAL_ENV', '') + '/lib/python3.4/site-packages']
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'IEPYdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+    ('index', 'IEPY.tex', u'IEPY Documentation', AUTHORS, 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'iepy', u'IEPY Documentation', [AUTHORS], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output ------------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    ('index', 'IEPY', u'IEPY Documentation', AUTHORS, 'IEPY',
+     'Information Extraction python library.', 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'

+ 105 - 0
docs/corpus_labeling.rst

@@ -0,0 +1,105 @@
+Creating a reference corpus
+===========================
+
+IEPY provides a web tool for creating a reference corpus in a simple and fast way. 
+This corpus can be used for evaluation or simply to have a labeled corpus of 
+relations between entity occurrences.
+
+
+Running the web server
+----------------------
+
+First of all, you need to run the web server that will provide the interface.
+This is done by running a *Django* server.
+
+Assuming you have an iepy instance and it's your current directory,
+to start the server you need to run 
+
+.. code-block:: sh
+    
+    $ python bin/manage.py runserver
+
+You will see a message like this:
+
+::
+
+    Starting development server at http://127.0.0.1:8000/
+    Quit the server with CONTROL-C.
+
+Home page
+---------
+
+At this point, you can go on and open a browser and access the URL `http://127.0.0.1:8000 <http://127.0.0.1:8000/>`_
+and you will get a screen like this:
+
+.. image:: home_screenshot.png
+
+
+After creating a relation, you can access it on the ``Create a reference corpus`` section of the home page.
+Once you get there, you'll find that there are two different ways to label evidences: by segment and by document.
+The default one is by document but you can switch between both of them.
+
+
+Document based labeling
+-----------------------
+
+This view presents a complete document with all the segments that make sense to show. These are
+the ones that contain entities of the entity kinds that your relation uses.
+
+.. image:: label_by_document_screenshot.png
+
+On the left side of the page, you'll see a list of the options that you have for labeling the evidences.
+To start labeling information, what you need to do is choose one of these options and then click on two
+entity occurrences (marked in yellow on the text).
+
+IEPY will only let you click on entity occurrences that have the type that your relation needs. Once
+you select the first entity occurrence, you will only be able to click on entities of the other
+entity type.
+
+.. image:: label_by_document_relation_labeled.png
+
+After saving, IEPY will automatically take you to the next document.
+Also, on top you have some navigation controls.
+
+.. note::
+
+    Be careful with the navigation buttons, because they won't save the changes that you've made on this document.
+
+
+Segment based labeling
+----------------------
+
+When labeling by segment, you are presented with a segment of a document, and you will have to
+answer if the relation is present on all the possible combinations of entity occurrences.
+
+.. image:: label_by_segment_screenshot.png
+
+Here what you will need to do is indicate, for every evidence, whether the relation is present or not.
+When saving, you will get another segment to label, and so on.
+
+On top you have navigation controls, and on the far right you have a link to switch to the
+document-based view.
+
+
+Fixing mistagged entity occurrences
+-----------------------------------
+
+It is possible that the automatic process that detects entities has made some mistakes.
+This leads to an entity tagged partially or incorrectly. In this case, we provide a tool to fix these problems.
+You can access this tool by right clicking on the problematic entity and choosing **Modify entity occurrence**
+
+.. image:: label_by_document_entity_edition.png
+
+There you can completely remove the entity or change the limits so it holds more (or less) tokens.
+
+
+Creating new occurrences
+------------------------
+
+If an entity occurrence wasn't detected automatically, you can add it manually. To do so, right click on
+any token and choose **Create entity occurrence**. 
+
+.. image:: create_eo.png
+
+You can modify the limits of the tokens and the entity kind there. After this operation, new *evidence candidates*
+will be created if needed.

BIN
docs/create_eo.png


+ 45 - 0
docs/gazettes.rst

@@ -0,0 +1,45 @@
+Gazettes resolution
+===================
+
+We call a gazette a mapping between a list of tokens and an entity kind. If that list of tokens
+matches exactly on your text, then that would be tagged as an entity. 
+
+All the entity occurrences that were detected by a gazette and share the same set of tokens will share the same entity.
+This means that if you have a gazette that finds ``Dr. House`` and tags it as a ``PERSON``, all the occurrences in the text
+that match those tokens will belong to the same entity.
+
+Basic usage: Loading from csv
+-----------------------------
+
+The basic usage would be including a set of gazettes before running the preprocess step. To include
+the gazettes on your database, you can use the script ``gazettes_loader.py`` that comes included with
+your instance. This will take a csv file with the following format:
+
+::
+
+    <literal>,<class>
+
+Literal can be a single token or multiple tokens separated by space.
+The only restriction is that every literal is unique.
+
+For example, a gazettes csv file could be:
+
+::
+
+    literal,class
+    Dr. House,PERSON
+    Lupus,DISEASE
+    Headache,SYMPTOMS
+
+
+Removing elements
+-----------------
+
+When deleting an entity, all the occurrences are deleted with it, along with the gazette item that introduced them.
+Same goes the other way, if you delete a gazette item, the entity, and therefore the occurrences, will be deleted as well.
+
+To delete a gazette item, go to the database admin page and find the Gazette section. You'll be able to find the one that you want
+to remove.
+
+To remove an entity, find an occurrence by exploring a document on any of its views, and right click it. There you'll find a delete
+link that enables you to remove the whole entity. Keep in mind that this action will delete the gazette item.

BIN
docs/home_screenshot.png


+ 192 - 0
docs/how_to_hack.rst

@@ -0,0 +1,192 @@
+How to Hack
+===========
+
+There are several places where you can incorporate your own ideas and needs into IEPY.
+Here you'll see how to modify different parts of the iepy core.
+
+Altering how the corpus is created
+----------------------------------
+
+On the `preprocess <preprocess.html#how-to-customize>`_ section was already mentioned that you can customize how the corpus is created.
+
+
+Using your own classifier
+-------------------------
+
+You can change the definition of the *extraction classifier* that is used when running
+iepy in *active learning* mode.
+
+As the simplest example of doing this, check the following example.
+First, define your own custom classifier, like this:
+
+.. code-block:: python
+
+    from sklearn.linear_model import SGDClassifier
+    from sklearn.pipeline import make_pipeline
+    from sklearn.feature_extraction.text import CountVectorizer
+
+
+    class MyOwnRelationClassifier:
+        def __init__(self, **config):
+            vectorizer = CountVectorizer(
+                preprocessor=lambda evidence: evidence.segment.text)
+            classifier = SGDClassifier()
+            self.pipeline = make_pipeline(vectorizer, classifier)
+
+        def fit(self, X, y):
+            self.pipeline.fit(X, y)
+            return self
+
+        def predict(self, X):
+            return self.pipeline.predict(X)
+
+        def decision_function(self, X):
+            return self.pipeline.decision_function(X)
+
+
+and later, in iepy_runner.py of your IEPY instance, in the **ActiveLearningCore** creation,
+provide it as a configuration parameter like this
+
+
+.. code-block:: python
+
+    iextractor = ActiveLearningCore(
+        relation, labeled_evidences,
+        tradeoff=tuning_mode,
+        extractor_config={},
+        extractor=MyOwnRelationClassifier
+    )
+
+
+Implementing your own features
+------------------------------
+
+Your classifier can use features that are already built within iepy or you can create your
+own. You can even use a rule (as defined in the :doc:`rules core <rules_tutorial>`) as feature.
+
+Start by creating a new file in your instance, you can call it whatever you want, but for this
+example lets call it ``custom_features.py``. There you'll define your features:
+
+.. code-block:: python
+
+    # custom_features.py
+    from featureforge.feature import output_schema
+
+    @output_schema(int, lambda x: x >= 0)
+    def tokens_count(evidence):
+        return len(evidence.segment.tokens)
+
+
+.. note::
+
+    Your features can use some of the `Feature Forge's <http://feature-forge.readthedocs.org/en/latest/>`__
+    capabilities.
+
+Once you've defined your feature you can use it in the classifier by adding it to the configuration
+file. You should have one on your instance with all the default values, it's called ``extractor_config.json``.
+
+There you'll find 2 sets of features where you can add it: dense or sparse. Depending on the values returned
+by your feature you'll choose one over the other.
+
+To include it, you have to add a line with a python path to your feature function. If you're not familiar with
+the format, you should follow this pattern:
+
+::
+
+    {project_name}.{features_file}.{feature_function}
+
+In our example, our instance is called ``born_date``, so in the config this would be:
+
+.. code-block:: json
+
+    "dense_features": [
+        ...
+        "born_date.custom_features.tokens_count",
+        ...
+    ],
+
+Remember that if you want to use that configuration file you have to use the option ``--extractor-config``
+
+
+Using rules as features
+-----------------------
+
+In the same way, and without doing any change to the rule, you can
+add it as feature by declaring it in your config like this:
+
+Suppose your instance is called ``born_date`` and your rule is called ``born_date_in_parenthesis``,
+then you'll do:
+
+
+.. code-block:: json
+
+    "dense_features": [
+        ...
+        "born_date.rules.born_date_in_parenthesis",
+        ...
+    ],
+
+This will run your rule as a feature that returns 0 if it didn't match and 1 if it matched.
+
+Using all rules as one feature
+..............................
+
+Suppose you have a bunch of rules defined in your rules file and instead of using each rule as a
+different feature you want to use a single feature that runs all the rules to test if the evidence
+matches. You can write a custom feature that does so. Let's look an example snippet:
+
+.. code-block:: python
+
+    # custom_features.py
+    import refo
+
+    from iepy.extraction.rules import compile_rule, generate_tokens_to_match, load_rules
+
+    rules = load_rules()
+
+
+    def rules_match(evidence):
+        tokens_to_match = generate_tokens_to_match(evidence)
+
+        for rule in rules:
+            regex = compile_rule(rule, evidence.relation)
+
+            if refo.match(regex, tokens_to_match):
+                if rule.answer:  # positive rule
+                    return 1
+                else:  # negative rule
+                    return -1
+        # no rule matched
+        return 0
+
+
+This will define a feature called ``rules_match`` that tries every rule for an evidence
+until a match occurs, and returns one of three different values, depending on the type
+of match.
+
+To use this you have to add this single feature to your config like this:
+
+.. code-block:: json
+
+    "dense_features": [
+        ...
+        "born_date.custom_features.rules_match",
+        ...
+    ],
+
+
+
+Documents Metadata
+------------------
+
+While building your application, you might want to store some extra information about your documents.
+To avoid loading this data every time when predicting, we've separated the place to put this 
+information into another model called **IEDocumentMetadata** that is accessible through the **metadata** attribute.
+
+IEDocumentMetadata has 3 fields:
+
+    * title: for storing document's title
+    * url: to save the source url if the document came from a web page
+    * items: a dictionary that you can use to store anything you want.
+
+By default, the **csv importer** uses the document's metadata to save the filepath of the csv file on the *items* field.

Fichier diff supprimé car celui-ci est trop grand
+ 0 - 0
docs/iepy_1.svg


Fichier diff supprimé car celui-ci est trop grand
+ 0 - 0
docs/iepy_2.svg


Fichier diff supprimé car celui-ci est trop grand
+ 0 - 0
docs/iepy_3.svg


Fichier diff supprimé car celui-ci est trop grand
+ 0 - 0
docs/iepy_4.svg


+ 85 - 0
docs/index.rst

@@ -0,0 +1,85 @@
+.. IEPY documentation master file, created by
+   sphinx-quickstart on Wed Apr 23 20:02:15 2014.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to IEPY's documentation!
+================================
+
+IEPY is an open source tool for
+`Information Extraction <http://en.wikipedia.org/wiki/Information_extraction>`_
+focused on Relation Extraction.
+
+To give an example of Relation Extraction, if we are trying to find a
+birth date in:
+
+    `"John von Neumann (December 28, 1903 – February 8, 1957) was a Hungarian and
+    American pure and applied mathematician, physicist, inventor and polymath."`
+
+then IEPY's task is to identify "``John von Neumann``" and
+"``December 28, 1903``" as the subject and object entities of the "``was born in``"
+relation.
+
+It's aimed at:
+    - :doc:`users <active_learning_tutorial>`
+      needing to perform Information Extraction on a large dataset.
+    - :doc:`scientists <how_to_hack>`
+      wanting to experiment with new IE algorithms.
+
+
+You can follow the development of this project and report issues at http://github.com/machinalis/iepy
+or join the mailing list `here <https://groups.google.com/forum/?hl=es-419#%21forum/iepy>`__
+
+Features
+--------
+
+    - :doc:`A corpus annotation tool <corpus_labeling>`
+      with a `web-based UI <corpus_labeling.html#document-based-labeling>`_
+    - :doc:`An active learning relation extraction tool <active_learning_tutorial>`
+      pre-configured with convenient defaults.
+    - :doc:`A rule based relation extraction tool <rules_tutorial>`
+      for cases where the documents are semi-structured or high precision is required.
+    - A web-based user interface that:
+        - Allows layman users to control some aspects of IEPY.
+        - Allows decentralization of human input.
+    - A shallow entity ontology with coreference resolution via `Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml>`_
+    - :doc:`An easily hack-able active learning core <how_to_hack>`,
+      ideal for scientists wanting to experiment with new algorithms.
+
+
+Contents:
+---------
+
+.. toctree::
+   :maxdepth: 2
+
+   installation
+   tutorial
+   instantiation
+   active_learning_tutorial
+   rules_tutorial
+   preprocess
+   gazettes
+   corpus_labeling
+   how_to_hack
+   troubleshooting
+   language
+
+
+Authors
+-------
+
+IEPY is © 2014 `Machinalis <http://www.machinalis.com/>`_ in collaboration
+with the `NLP Group at UNC-FaMAF <http://pln.famaf.unc.edu.ar/>`_. Its primary
+authors are:
+
+ * Rafael Carrascosa <rcarrascosa@machinalis.com> (rafacarrascosa at github)
+ * Javier Mansilla <jmansilla@machinalis.com> (jmansilla at github)
+ * Gonzalo García Berrotarán <ggarcia@machinalis.com> (j0hn at github)
+ * Franco M. Luque <francolq@famaf.unc.edu.ar> (francolq at github)
+ * Daniel Moisset <dmoisset@machinalis.com> (dmoisset at github)
+
+Changelog
+---------
+
+.. include:: Changelog

+ 79 - 0
docs/installation.rst

@@ -0,0 +1,79 @@
+==================
+IEPY installation
+==================
+
+IEPY runs on *python 3*, and it's fully tested with version *3.4*.
+These installation notes assume that you have a fresh installation of *Ubuntu 14.04*.
+If you are installing IEPY on a different platform, some details
+or software versions may be slightly different.
+
+Because of some of its dependencies, IEPY installation is not a single
+pip install, but it's actually not that hard.
+
+Outline:
+    - install some system packages
+    - install iepy itself
+    - download 3rd party binaries
+
+
+System software needed
+----------------------
+
+You need to install the following packages:
+
+.. code-block:: bash
+
+    sudo apt-get install build-essential python3-dev liblapack-dev libatlas-dev gfortran
+
+They are needed for python Numpy installation. Once this is done, install numpy by doing:
+
+.. code-block:: bash
+
+    pip install numpy
+
+
+And later, to be able to run some java processes:
+
+.. code-block:: bash
+
+    sudo apt-get install openjdk-7-jre
+
+.. note::
+
+    Instead of openjdk-7-jre you can use any other java (version 1.6 or higher) you
+    may have.
+
+    **Java 1.8** will allow you to use the **newest preprocess models**.
+
+
+Install IEPY package
+--------------------
+
+1. :doc:`Create a Virtualenv <virtualenv>`
+
+2. Install IEPY itself
+
+.. code-block:: bash
+
+    pip install iepy
+
+3. Configure java & NLTK
+
+    In order to preprocess documents, set the
+    environment variable JAVAHOME=/usr/bin/java (or the path where java was installed)
+    To make this configuration persistent, add it to your shell rc file.
+
+Download the third party data and tools
+---------------------------------------
+
+You should have now a command named "*iepy*". Use it like this to get some required
+binaries.
+
+.. code-block:: bash
+
+    iepy --download-third-party-data
+
+.. note::
+
+    If the java binary pointed by your JAVAHOME is 1.8, newest preprocess models will
+    be acquired and used.

+ 137 - 0
docs/instantiation.rst

@@ -0,0 +1,137 @@
+Instantiation
+=============
+
+Here, we'll explain in detail what an instantiation contains and what it does.
+
+Folder structure
+----------------
+
+The folder structure of an iepy instance is the following:
+
+.. code-block:: bash
+
+    ├── __init__.py
+    ├── settings.py
+    ├── database_name_you_picked.sqlite
+    ├── bin
+    │   ├── csv_to_iepy.py
+    │   ├── iepy_rules_runner.py
+    │   ├── iepy_runner.py
+    │   ├── manage.py
+    │   ├── preprocess.py
+    │   └── rules_verifier.py
+    ├── extractor_config.json
+    └── rules.py
+
+
+Let's see why each one of those files is there:
+
+
+Settings file
+.............
+
+settings.py is a configuration file where you can change the database settings and all the web interface related settings.
+This file has a `django settings <https://docs.djangoproject.com/en/1.7/ref/settings/>`_ file format.
+
+Database
+........
+
+When you create an instance, a *sqlite* database is created by default.
+It has no data yet, since you'll have to fill it with your own data.
+
+When working with big datasets, it's recommended to use some database engine other than *sqlite*.
+To change the database engine, change the `DATABASES` section of the settings file:
+
+::
+
+    DATABASES = {
+        'default': {
+            'ENGINE': 'django.db.backends.sqlite3',
+            'NAME': 'database_name_you_picked.sqlite',
+        }
+    }
+
+For example, you can use PostgreSQL like this:
+
+::
+
+    DATABASES = {
+        'default': {
+            'ENGINE': 'django.db.backends.postgresql_psycopg2',
+            'NAME': 'your_database_name',
+        }
+    }
+
+(Remember that you'll need to install ``psycopg2`` first with a simple ``pip install psycopg2``)
+
+Take a look at the `django database configuration documentation <https://docs.djangoproject.com/en/dev/ref/settings/#databases>`_ for more detail.
+
+.. note::
+
+    Each time you change your database (either the engine or the name) you will have
+    to instruct *django* to create all the tables in it, like this:
+
+    .. code-block:: bash
+
+        python bin/manage.py migrate
+
+
+Active learning configuration
+.............................
+
+``extractor_config.json`` specifies the configuration of the active learning core in *json* format.
+
+Rules definition
+................
+
+If you decide to use the rule based core, you'll have to define all your rules in the file ``rules.py``
+
+You can verify if your rules run correctly using ``bin/rules_verifier.py``.
+Read more about it `here <rules_tutorial.html#verifying-your-rules>`__.
+
+CSV importer
+............
+
+In the ``bin`` folder, you'll find a tool to import data from CSV files. This is the script ``csv_to_iepy.py``.
+Your CSV data has to be in the following format:
+
+::
+
+    <document_id>, <document_text>
+
+Preprocess
+..........
+
+To preprocess your data, you will use the  ``bin/preprocess.py`` script. Read more about it :doc:`here <preprocess>`
+
+Runners
+.......
+
+In the ``bin`` folder, you have scripts to run the active learning core (``iepy_runner.py``) or the
+rule based core (``iepy_rules_runner.py``)
+
+Web UI management
+.................
+
+For the web server management, you have the ``bin/manage.py`` script. This is a `django manage file <https://docs.djangoproject.com/en/1.7/ref/django-admin/>`_
+and with it you can start up your server.
+
+
+Instance Upgrade
+----------------
+
+From time to time, small changes in the iepy internals will require an *upgrade* of existing iepy instances.
+
+The upgrade process will apply the needed changes to the instance-folder structure.
+
+If you made local changes, the tool will preserve a copy of your changes so you can merge the conflicting areas by hand.
+
+To upgrade an iepy instance, simply run the following command
+
+    .. code-block:: bash
+
+        iepy --upgrade <instance path>
+
+.. note::
+
+    Look at the settings file to find the version of any iepy instance.

BIN
docs/label_by_document_entity_edition.png


BIN
docs/label_by_document_relation_labeled.png


BIN
docs/label_by_document_screenshot.png


BIN
docs/label_by_segment_screenshot.png


BIN
docs/labels_by_iepy.png


+ 60 - 0
docs/language.rst

@@ -0,0 +1,60 @@
+==================
+Language support
+==================
+
+By default IEPY will use English models, but it's also able to work with different
+languages.
+
+The preprocess machinery that's provided by default (Stanford Core NLP) has support
+for some other languages, so, check their models and documentation in case you need this.
+
+.. note::
+
+    The main goal until now was to architect IEPY to allow different languages.
+    Right now, the only fully supported languages are English, Spanish and German. If you need
+    something else, do not hesitate to contact us.
+
+
+Language Installation and Models
+--------------------------------
+
+The language models used by IEPY (the information used during preprocessing phase)
+are stored on your IEPY installation. Several models for different languages can be
+installed on the same installation.
+
+In order to download Spanish models you should run
+
+.. code-block:: bash
+
+    iepy --download-third-party-data --lang=es
+
+
+In order to download German models you should run
+
+.. code-block:: bash
+
+    iepy --download-third-party-data --lang=de
+
+
+.. note::
+
+    Check Stanford Core NLP documentation and files to download for more language packages.
+
+
+Language Definition and Instances
+---------------------------------
+
+Every IEPY instance works for a single language, which is declared on the settings.py file.
+
+To change the instance language, change the settings file on the section where it says `IEPY_LANG`:
+
+::
+
+    IEPY_LANG = 'en'
+
+
+To create an IEPY instance for a different language, you should run
+
+.. code-block:: bash
+
+    iepy --create --lang=es <folder_path>

+ 231 - 0
docs/preprocess.rst

@@ -0,0 +1,231 @@
+About the Pre-Process
+=====================
+
+The preprocessing adds the metadata that iepy needs to detect the relations, which includes:
+
+    * Text tokenization and sentence splitting.
+    * Text lemmatization
+    * Part-Of-Speech (POS) tagging.
+    * Named Entity Recognition (NER).
+    * Gazettes resolution
+    * Syntactic parsing.
+    * TextSegments creation (internal IEPY text unit).
+
+We're currently running all these steps (except the last one) using the `Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml>`_ tools.
+This runs as a single all-in-one pass, but every step can be :ref:`modified to use a custom version <customize>` to adjust to your needs.
+
+
+About the Tokenization and Sentence splitting
+---------------------------------------------
+
+The text of each Document is split on tokens and sentences, and that information is stored
+on the document itself, preserving (and also storing) for each token the offset (in chars)
+to the original document text.
+
+The one used by default it's the one that the `Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml>`_ provides.
+
+.. note::
+
+    While using the Stanford tokenizer, you can customize some of tokenization options.
+
+    First read here: `tokenizer options <http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/process/PTBTokenizer.html>`_
+
+    On your instance *settings.py* file, add options as keys on the CORENLP_TKN_OPTS dict.
+    You can use as key any of the "known options", and as value,
+    use True or False for booleans, or just strings when option requires a text.
+    Example:
+
+    .. code-block:: python
+
+        CORENLP_TKN_OPTS = {
+            'latexQuotes': False
+        }
+
+
+Lemmatization
+-------------
+
+.. note::
+
+    Lemmatization was added on the version 0.9.2, all instances that were created before that,
+    need to run the preprocess script again. This will run only the lemmatization step.
+
+The text runs through a step of lemmatization where each token gets a lemma. This is a canonical form of the word that
+can be used in the classifier features or the rules core.
+
+
+Part of speech tagging (POS)
+----------------------------
+
+Each token is augmented with metadata about its part of speech such as noun, verb,
+adjective and other grammatical tags.
+Along with the token itself, this may be used by the NER to detect an entity occurrence.
+This information is also stored on the Document itself, together with the tokens.
+
+The one used by default it's the one that the `Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml>`_ provides.
+
+Named Entity Recognition (NER)
+------------------------------
+
+To find a relation between entities one must first recognize these entities in the text.
+
+As a result of NER, each document is augmented with information about all the found
+Named Entities (together with which tokens are involved in each occurrence).
+
+An automatic NER is used to find occurrences of an entity in the text.
+
+The default pre-process uses the Stanford NER, check the Stanford CoreNLP's `documentation <http://nlp.stanford.edu/software/corenlp.shtml>`_
+to find out which entity kinds are supported, but includes:
+
+    * Location
+    * Person
+    * Organization
+    * Date
+    * Number
+    * Time
+    * Money
+    * Percent
+
+Others remarkable features of this NER (that are incorporated to the default pre-process) are:
+
+    - pronoun resolution
+    - simple co-reference resolution
+
+This step can be customized to find entities of kinds defined by you, or anything else you may need.
+
+Gazettes resolution
+-------------------
+
+In case you want to add named entity recognition by matching literals, iepy provides a system of gazettes.
+This is a mapping of literals and entity kinds that will be run on top of the basic stanford NER.
+With this, you'll be able to recognize entities beyond the ones detected by the stanford NER, or even correct
+those that are incorrectly tagged.
+
+:doc:`Learn more about here. <gazettes>`
+
+
+Syntactic parsing
+-----------------
+
+.. note::
+
+    Syntactic parsing was added on the version 0.9.3, all instances that were created before that,
+    need to run the preprocess script again. This will run only the syntactic parsing step.
+
+The sentences are parsed to work out the syntactic structure. Each sentence gets a structure tree
+that is stored in `Penn Treebank notation <http://en.wikipedia.org/wiki/Treebank>`__. IEPY presents
+this to the user using a `NLTK Tree object <http://www.nltk.org/howto/tree.html>`__.
+
+By default the sentences are processed with the `Stanford Parser <http://nlp.stanford.edu/software/lex-parser.shtml>`__
+provided within the `Stanford CoreNLP <http://nlp.stanford.edu/software/corenlp.shtml>`__.
+
+For example, the syntactic parsing of the sentence ``Join the dark side, we have cookies`` would be:
+
+::
+
+    (ROOT
+      (S
+        (S
+          (VP (VBN Join)
+            (NP (DT the) (JJ dark) (NN side))))
+        (, ,)
+        (NP (PRP we))
+        (VP (VBP have)
+          (NP (NNS cookies)))))
+
+About the Text Segmentation
+---------------------------
+
+IEPY works on a **text segment** (or simply **segment**) level, meaning that will
+try to find if a relation is present within a segment of text. The
+pre-process is the responsible for splitting the documents into segments.
+
+The default pre-process uses a segmenter that creates segments for the documents with the following criteria:
+
+ * for each sentence on the document, if there are at least 2 Entity Occurrences in there
+
+
+.. _customize:
+
+How to customize
+----------------
+
+On your own IEPY instances, there's a file called ``preprocess.py`` located in the ``bin`` folder.
+There you'll find that the default is to simply run the Stanford preprocess, and later the segmenter.
+This can be changed to run a sequence of steps defined by you.
+
+For example, take this pseudo-code to guide you:
+
+.. code-block:: python
+
+    pipeline = PreProcessPipeline([
+        CustomTokenizer(),
+        CustomSentencer(),
+        CustomLemmatizer(),
+        CustomPOSTagger(),
+        CustomNER(),
+        CustomSegmenter(),
+    ], docs)
+    pipeline.process_everything()
+
+
+.. note::
+
+    The steps can be functions or callable objects. We recommend objects because generally you'll
+    want to do some load up of things on the `__init__` method to avoid loading everything over and over again.
+
+Each one of those steps will be called with each one of the documents, meaning that every step will be called
+with all the documents; after that finishes, the next step will be called with each one of the documents.
+
+
+Running in multiple cores
+-------------------------
+
+Preprocessing might take a lot of time. To handle this you can run the preprocessing on several cores of the
+same machine or even run it on different machines to accelerate the processing.
+
+To run it on the same machine using multiple cores, all you need to do is run:
+
+.. code-block:: bash
+
+    $ python bin/preprocess.py --multiple-cores=all
+
+This will use all the available cores. You can also specify a number if you want to
+use less than that, like this:
+
+.. code-block:: bash
+
+    $ python bin/preprocess.py --multiple-cores=2
+
+Running in multiple machines
+----------------------------
+
+Running the preprocess on different machines is a bit tricky; here's what you'll need:
+
+    * A iepy instance with a database that allows remote access (such as postgres)
+    * One iepy instance on each extra machine that has the database setting pointing to the main one.
+
+Then you'll need to decide on how many parts do you want to split the document set
+and run each part on a different machine. For example, you could split the documents in 4 and run 2 processes
+on one machine and 2 on another one. To do this you'll run:
+
+
+On one of the machines, in two different consoles run:
+
+.. code-block:: bash
+
+    $ python bin/preprocess.py --split-in=4 --run-part=1
+
+.. code-block:: bash
+
+    $ python bin/preprocess.py --split-in=4 --run-part=2
+
+And on the other machine:
+
+.. code-block:: bash
+
+    $ python bin/preprocess.py --split-in=4 --run-part=3
+
+.. code-block:: bash
+
+    $ python bin/preprocess.py --split-in=4 --run-part=4

+ 188 - 0
docs/rules_tutorial.rst

@@ -0,0 +1,188 @@
+Running the rule based core
+===========================
+
+Here we will guide you through the steps to use the rule based system
+to detect relations on the documents.
+
+
+How they work
+-------------
+
+In the rule based system, you have to define a set of "regular expression like" rules
+that will be tested against the segments of the documents. Roughly speaking,
+if a rule matches it means that the relation is present.
+
+This is used to acquire high precision because you control exactly what is matched.
+
+
+Anatomy of a rule
+-----------------
+
+.. note::
+    If you don't know how to define a python function,
+    `check this out <https://docs.python.org/3/tutorial/controlflow.html#defining-functions>`_
+
+
+A rule is basically a *decorated python function*.
+We will see where this needs to be added later, for now lets concentrate on how it is written.
+
+.. code-block:: python
+
+    @rule(True)
+    def born_date_and_death_in_parenthesis(Subject, Object):
+        """ Example: Carl Bridgewater (January 2, 1965 - September 19, 1978) was shot dead """
+        anything = Star(Any())
+        return Subject + Pos("-LRB-") + Object + Token("-") + anything + Pos("-RRB-") + anything
+
+First you have to specify that your function is in fact a rule by using the **decorator @rule**.
+
+As you can see in the first line, this is added on top of the function.
+In this decorator you have to define if the rule is going to be *positive* or *negative*. A positive
+rule that matches will label the relations as present and a negative one will label it as not present.
+You can define this by passing the True or False parameter to the rule decorator.
+
+Then comes the definition of the function. This function takes two parameters: the **Subject** and the **Object**.
+These are patterns that will be part of the regex that the function has to return.
+
+After that comes the body of the function. Here the regular expression has to be constructed and
+returned by the function.  This is not an ordinary regular expression, it
+uses `ReFO <https://github.com/machinalis/refo>`_.
+In ReFO you have to operate with objects that do some kind of check on the text segment.
+
+For our example, we've chosen to look for the *Was Born* relation. Particularly we look for the date of birth of a
+person when it is written like this:
+
+::
+
+    Carl Bridgewater (January 2, 1965 - September 19, 1978)
+
+To match this kind of cases, we have to specify the regex as a sum of predicates. This will check if every
+part matches.
+
+Rule's building blocks
+----------------------
+
+Aside from the ReFO predicates, iepy comes with a bunch of predicates that you will find useful for creating your own rules
+
+    * **Subject**: matches the evidence's left part.
+    * **Object**: matches the evidence's right part.
+    * **Token**: matches if the token is literally the one specified.
+    * **Lemma**: matches if the lemma is literally the one specified.
+    * **Pos**: matches the *part of speech* of the token examined.
+    * **Kind**: matches if the token belongs to an entity occurrence with a given kind.
+
+
+Setting priority
+----------------
+
+Using the **rule decorator**, you can set that a rule is more important than another, and because of that it will
+be tried first.
+
+IEPY will run the rules ordered decreasingly by their priority number, and the default priority is 0.
+
+For example, to set a priority of 1 you do:
+
+.. code-block:: python
+
+    @rule(True, priority=1)
+    def rule_name(Subject, Object):
+        ...
+
+
+Negative rules
+--------------
+
+If you spot that your rules are matching things erroneously, you can write a rule
+that catches that before it is taken by a positive rule.
+
+You do this by setting the rule as a *negative rule* using the decorator. It is also
+recommended to set a higher priority so it is checked before the other ones.
+
+Example:
+
+
+.. code-block:: python
+
+    @rule(False, priority=1)
+    def incorrect_labeling_of_place_as_person(Subject, Object):
+        """
+        Ex:  Sophie Christiane of Wolfstein (24 October 24, 1667 - 23 August 1737)
+
+        Wolfstein is a *place*, not a *person*
+        """
+        anything = Star(Any())
+        person = Plus(Pos("NNP") + Question(Token(",")))
+        return anything + person + Token("of") + Subject + anything
+
+
+Note that the parameters of the rule decorator are **False** and **priority=1**
+
+Where do I place the rules
+--------------------------
+
+On your project's instance folder, there should be a *rules.py* file. All rules should be placed
+there along with a **RELATION** variable that sets which relation is going to be used.
+
+This is the file that will be loaded when you run the *iepy_rules_runner*.
+
+
+Example
+-------
+
+This is a portion of the example provided with IEPY, you can view the `complete
+file here <https://github.com/machinalis/iepy/blob/develop/examples/birthdate/was_born_rules_sample.py>`__.
+
+.. code-block:: python
+
+    from refo import Question, Star, Any, Plus
+    from iepy.extraction.rules import rule, Token, Pos
+
+    RELATION = "was born"
+
+    @rule(True)
+    def was_born_explicit_mention(Subject, Object):
+        """
+        Ex: Shamsher M. Chowdhury was born in 1950.
+        """
+        anything = Star(Any())
+        return anything + Subject + Token("was born") + Pos("IN") + Object + anything
+
+
+    @rule(True)
+    def is_born_in(Subject, Object):
+        """
+        Ex: Xu is born in 1902 or 1903 in a family of farmers in Hubei ..
+        """
+        anything = Star(Any())
+        return Subject + Token("is born in") + Object + anything
+
+
+    @rule(True)
+    def just_born(Subject, Object):
+        """
+        Ex: Lyle Eugene Hollister, born 6 July 1923 in Sioux Falls, South Dakota, enlisted in the Navy....
+        """
+        anything = Star(Any())
+        return Subject + Token(", born") + Object + anything
+
+
+Verifying your rules
+--------------------
+
+During the construction of your rules, you might want to check whether the rules are matching or not.
+Even more, if you have tagged data in your corpus, you can know how good the performance is.
+
+The rules verifier is located on your instance under the ``bin`` directory, it's called ``rules_verifier.py``
+
+You can run the verifier with every rule or with a single rule, on all of the segments or in a sample of those.
+Take a look at the parameters on the rules verifier to find out how to use them by running:
+
+.. code-block:: bash
+
+    $ python bin/rules_verifier.py --help
+
+If you have labeled data on your corpus, the run will calculate how it scored in terms of precision, recall and
+other metrics. You have to keep in mind that this is not exactly what you'll get when you run the rules core, even
+if you run the verifier with all the rules and all the data, the numbers are going to be a little different because
+this will run every evidence with every rule, and the core instead stops at the first match. This is just a warning so you
+don't get too excited or too depressed with these results.

+ 2 - 0
docs/setup/requirements-base.txt

@@ -0,0 +1,2 @@
+# because of https://github.com/machinalis/iepy/issues/63
+-e .

+ 3 - 0
docs/setup/requirements-development.txt

@@ -0,0 +1,3 @@
+-r requirements-base.txt
+Sphinx==1.2.2
+pygal==1.4.6

+ 0 - 0
docs/setup/system_packages.txt


+ 4 - 0
docs/setup/third_party.txt

@@ -0,0 +1,4 @@
+There is a script to download 3rd party data in scripts/download_third_party_data.py
+Currently it downloads:
+    - The stanford POS and NES tagger
+    - punktokenizer

+ 38 - 0
docs/troubleshooting.rst

@@ -0,0 +1,38 @@
+==================
+Troubleshooting
+==================
+
+
+32 bit architecture issues
+--------------------------
+
+We've experienced some memory issues when using a computer with 32 bit architecture. This is because by default we use the
+Stanford CoreNLP (java based), which has some special needs about the memory. Read about them more in detail `here <http://nlp.stanford.edu/software/tagger.shtml>`__
+
+We quote:
+
+    The system requires Java 1.8+ to be installed. Depending on whether you're running 32 or 64 bit Java and the complexity of the tagger model, you'll need somewhere between 60 and 200 MB of memory to run a trained tagger (i.e., you may need to give java an option like java -mx200m)
+
+What has worked for us is adding the following environment variable before running iepy:
+
+.. code-block:: bash
+
+    export _JAVA_OPTIONS='-Xms1024M -Xmx1024m'
+
+You can modify those numbers to your convenience.
+
+
+Preprocess not running under MacOS
+----------------------------------
+
+    Problems with the preprocess under MacOS? Apparently a change in the CoreNLP script is needed to
+    be run. You need to change the file ``corenlp.sh`` that is located on
+    ``/Users/<your user>/Library/Application Support/iepy/stanford-corenlp-full-2014-08-27/``
+    and change ``scriptdir=`dirname $0``` for ``scriptdir=`dirname "$0"``` (ie, add double quotes around ``$0``)
+
+
+Can't install IEPY with python 2
+--------------------------------
+
+  Indeed, IEPY works with Python 3.4 or higher.
+

+ 110 - 0
docs/tutorial.rst

@@ -0,0 +1,110 @@
+From 0 to IEPY
+==============
+
+In this tutorial we will guide you through the steps to create your first
+Information Extraction application with IEPY.
+Be sure you have a working :doc:`installation <installation>`.
+
+IEPY internally uses `Django <https://www.djangoproject.com/>`_ to define the database models,
+and to provide a web interface. You'll see some components of Django around the project, such as the
+configuration file (with the database definition) and the ``manage.py`` utility. If you're familiar
+with Django, you will move faster in some of the steps.
+
+
+0 - Creating an instance of IEPY
+--------------------------------
+
+To work with IEPY, you'll have to create an *instance*.
+This is going to be where the configuration, database and some binary files are stored.
+To create a new instance you have to run:
+
+.. code-block:: bash
+
+    iepy --create <project_name>
+
+Where *<project_name>* is something that you choose.
+This command will ask you a few things such as database name, its username and its password.
+When that's done, you'll have an instance in a folder with the name that you chose.
+
+Read more about the instantiation process :doc:`here <instantiation>`.
+
+
+1 - Loading the database
+------------------------
+
+The way we load the data into the database is importing it from a *csv* file. You can use the script **csv_to_iepy**
+provided in your application folder to do it.
+
+
+.. code-block:: bash
+
+    python bin/csv_to_iepy.py data.csv
+
+This will load **data.csv** into the database, from which the data will subsequently be accessed.
+
+Learn more about the required CSV file format `here <instantiation.html#csv-importer>`_.
+
+
+.. note::
+
+    You might also provide a *gziped csv file.*
+
+
+2 - Pre-processing the data
+---------------------------
+
+Once you have your database with the documents you want to analyze, you have to
+run them through the pre-processing pipeline to generate all the information needed by IEPY's core.
+
+The pre-processing pipeline runs a series of steps such as 
+text tokenization, sentence splitting, lemmatization, part-of-speech tagging,
+and named entity recognition
+
+:doc:`Read more about the pre-processing pipeline here. <preprocess>`
+
+Your IEPY application comes with code to run all the pre-processing steps.
+You can run it by doing:
+
+.. code-block:: bash
+
+    python bin/preprocess.py
+
+This *will* take a while, especially if you have a lot of data.
+
+
+
+3 - Open the web interface
+--------------------------
+
+To help you control IEPY, you have a web user interface.
+Here you can manage your database objects and label the information
+that the active learning core will need.
+
+To access the web UI, you must run the web server. Don't worry, you have everything
+that you need on your instance folder and it's as simple as running:
+
+.. code-block:: bash
+
+    python bin/manage.py runserver
+
+Leave that process running, and open up a browser at `http://127.0.0.1:8000 <http://127.0.0.1:8000>`_ to view
+the user interface home page.
+
+Now it's time for you to *create a relation definition*. Use the web interface to create the relation that you
+are going to be using.
+
+IEPY
+----
+
+Now, you're ready to run either the :doc:`active learning core <active_learning_tutorial>`
+or the :doc:`rule based core <rules_tutorial>`.
+
+
+Constructing a reference corpus
+-------------------------------
+
+To test information extraction performance, IEPY provides a tool for labeling the entire corpus "by hand"
+and then check the performance experimenting with that data.
+
+If you would like to create a labeled corpus to test the performance or for other purposes, take a look at
+the :doc:`corpus labeling tool <corpus_labeling>`

+ 19 - 0
docs/virtualenv.rst

@@ -0,0 +1,19 @@
+Virtualenv creation
+-------------------
+
+For organization's sake, it's strongly recommended to do the whole IEPY
+installation inside a virtual python environment.
+
+We shouldn't be explaining how to create it here, so we won't.
+There is way better documentation
+`here <https://docs.python.org/3.4/library/venv.html>`__
+for python 3.4.
+
+Just make sure to have it created and activated while following the
+IEPY installation instructions.
+Some small notes before leading you to the good documentation:
+
+ - If you are working with python3.3 (or 3.4 but with the buggy ubuntu/debian release),
+   be warned that you will need to install *pip* by hand,
+   as explained `here <http://pip.readthedocs.org/en/latest/installing.html#install-pip>`__
+ - Alternatively, create your virtualenv with `virtualenvwrapper <http://virtualenvwrapper.readthedocs.org/en/latest/install.html#basic-installation>`_

+ 7 - 0
examples/birthdate/scripts/create_birthdate_relation.py

@@ -0,0 +1,7 @@
+from iepy.data.models import Relation, EntityKind
+
+
+if __name__ == "__main__":
+    person = EntityKind.objects.get_or_create(name="PERSON")[0]
+    date = EntityKind.objects.get_or_create(name="DATE")[0]
+    Relation.objects.get_or_create(name="BIRTHDATE", left_entity_kind=person, right_entity_kind=date)

+ 47 - 0
examples/birthdate/scripts/csv_to_iepy.py

@@ -0,0 +1,47 @@
+"""
+Birthdate corpus preprocessing script.
+
+Usage:
+    csv_to_iepy.py <filename>
+    csv_to_iepy.py -h | --help
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+corpus in two columns: 'freebase_mid' and 'description'.
+
+Options:
+  -h --help             Show this screen
+"""
+import logging
+import csv
+import gzip
+import os
+
+from docopt import docopt
+
+from iepy.data.db import DocumentManager
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO,
+                        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+    opts = docopt(__doc__, version=0.1)
+
+    name = opts["<filename>"]
+    if name.endswith(".gz"):
+        fin = gzip.open(name, "rt")
+    else:
+        fin = open(name, "rt")
+    reader = csv.DictReader(fin)
+    name = os.path.basename(name)
+
+    docdb = DocumentManager()
+
+    seen = set()
+    for i, d in enumerate(reader):
+        mid = d["freebase_mid"]
+        if mid in seen:
+            continue
+        seen.add(mid)
+        docdb.create_document(identifier=mid,
+                              text=d["description"],
+                              metadata={"input_filename": name})

+ 34 - 0
examples/birthdate/scripts/preprocess.py

@@ -0,0 +1,34 @@
+"""
+Birthdate corpus preprocessing script
+
+Usage:
+    preprocess.py
+    preprocess.py -h | --help | --version
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+import logging
+
+from docopt import docopt
+
+from iepy.data.db import DocumentManager
+from iepy.preprocess.stanford_preprocess import StanfordPreprocess
+from iepy.preprocess.pipeline import PreProcessPipeline
+from iepy.preprocess.segmenter import SyntacticSegmenterRunner
+
+
+if __name__ == '__main__':
+    logger = logging.getLogger(u'preprocess')
+    logger.setLevel(logging.INFO)
+    logging.basicConfig(level=logging.INFO,
+                        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+    opts = docopt(__doc__, version=0.1)
+    docs = DocumentManager()
+    pipeline = PreProcessPipeline([
+        StanfordPreprocess(),
+        SyntacticSegmenterRunner(increment=True)
+    ], docs
+    )
+    pipeline.process_everything()

+ 42 - 0
examples/birthdate/settings.py

@@ -0,0 +1,42 @@
+"""
+For more information on this file, see
+https://docs.djangoproject.com/en/1.7/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/1.7/ref/settings/
+"""
+
+from iepy.webui.webui.settings import *
+
+IEPY_VERSION = '0.9.6'
+IEPY_LANG = 'en'
+SECRET_KEY = 'u==!fueit=wxo&j8!5u+sfasp4prjluk@*s=7!-wz_&r@pn))r'
+DEBUG = True
+TEMPLATE_DEBUG = True
+
+# Database
+# https://docs.djangoproject.com/en/1.7/ref/settings/#databases
+# DATABASES = {
+#     'default': {
+#         'ENGINE': 'django.db.backends.sqlite3',
+#         'NAME': '/home/python/luojiehua/dl_nlp/iepy-develop/examples/test/test.sqlite',
+#     }
+# }
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.postgresql_psycopg2',
+        'NAME': 'iepy',
+        'USER': 'postgres',
+        'PASSWORD': 'postgres',
+        'HOST': 'localhost',
+        'PORT': '5432'
+    }
+}
+
+# For changing tokenization options, read here.
+# http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/process/PTBTokenizer.html
+# You can use as key any of the "known options" listed on that page, and as value,
+# use True or False (python names) for booleans, or strings when option requires a text
+# CORENLP_TKN_OPTS = {
+#     'latexQuotes': False
+# }

+ 122 - 0
examples/birthdate/was_born_rules_sample.py

@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+
+from refo import Question, Star, Any, Plus
+from iepy.extraction.rules import rule, Token, Pos
+
+
+RELATION = "was born"
+
+
+@rule(True)
+def born_date_in_parenthesis(Subject, Object):
+    """
+    Ex: Gary Sykes (Born 13 February 1984) is a British super featherweight boxer.
+    """
+    anything = Star(Any())
+    born = Star(Pos(":")) + Question(Token("Born") | Token("born")) + Question(Token("c."))
+    entity_leftover = Star(Pos("NNP"))
+    return Subject + entity_leftover + Pos("-LRB-") + born + Object + Pos("-RRB-") + anything
+
+
+@rule(True)
+def born_two_dates_in_parenthesis(Subject, Object):
+    """
+    Ex: James Cunningham (born 1973 or 1974) is a Canadian stand-up comedian and TV host.
+    """
+    anything = Star(Any())
+    born = Question(Token("Born") | Token("born"))
+    entity_leftover = Star(Pos("NNP"))
+    subject = Subject + entity_leftover
+    or_object = (Object + Token("or") + Pos("CD") |
+                 Pos("CD") + Token("or") + Object)
+    return subject + Pos("-LRB-") + born + or_object + Pos("-RRB-") + anything
+
+
+@rule(True)
+def born_date_and_death_in_parenthesis(Subject, Object):
+    """
+    Ex: Carl Bridgewater (January 2, 1965 - September 19, 1978) was shot dead
+    """
+    anything = Star(Any())
+    return Subject + Pos("-LRB-") + Object + Token("-") + anything + Pos("-RRB-") + anything
+
+
+@rule(True)
+def born_date_and_place_in_parenthesis(Subject, Object):
+    """
+    Ex: Gary Sykes (Born 13 February 1984) is a British super featherweight boxer.
+    """
+    anything = Star(Any())
+    born = (Token("Born") | Token("born"))
+    entity_leftover = Star(Pos("NNP"))
+    place = Plus(Pos("NNP") + Question(Token(",")))
+    return Subject + entity_leftover + Pos("-LRB-") + born + Object + Token(",") + place + Pos("-RRB-") + anything
+
+
+@rule(True)
+def was_born_explicit_mention(Subject, Object):
+    """
+    Ex: Shamsher M. Chowdhury was born in 1950.
+    """
+    anything = Star(Any())
+    return anything + Subject + Token("was born") + Pos("IN") + Object + anything
+
+
+@rule(True)
+def is_born_in(Subject, Object):
+    """
+    Ex: Xu is born in 1902 or 1903 in a family of farmers in Hubei (China RRB)
+    """
+    anything = Star(Any())
+    return Subject + Token("is born in") + Object + anything
+
+
+@rule(True)
+def mentions_real_name(Subject, Object):
+    """
+    Ex: Harry Pilling, born Ashtonunder-Lyne, Lancashire on 2 February 1943, played ...
+    """
+    anything = Star(Any())
+    real_name = Plus(Pos("NNP") + Question(Token(",")))
+    return Subject + Token("born") + real_name + Pos("IN") + Object + anything
+
+
+@rule(True)
+def was_born_and_mentions_place(Subject, Object):
+    """
+    Ex: Nasser Sharify was born in Tehran, Iran, in 1925.
+    """
+    place = Plus(Pos("NNP") + Question(Token(",")))
+    return Subject + Token("was born") + Pos("IN") + place + Pos("IN") + Object + Question(Pos("."))
+
+
+@rule(True)
+def was_born_and_mentions_place_2(Subject, Object):
+    """
+    Ex: Theodone C. Hu was born in 1872 in Huangpu town, Haizhu District, Guangzhou, Guangdong, China.
+    """
+    anything = Star(Any())
+    place = Plus(Pos("NNP") + Question(Token(",")))
+    return Subject + Token("was born") + Pos("IN") + Object + Pos("IN") + place + anything
+
+
+@rule(True)
+def just_born(Subject, Object):
+    """
+    Ex: Lyle Eugene Hollister, born 6 July 1923 in Sioux Falls, South Dakota, enlisted in the Navy....
+    """
+    anything = Star(Any())
+    return Subject + Token(", born") + Object + anything
+
+
+## NEGATIVE RULES ##
+
+@rule(False, priority=1)
+def incorrect_labeling_of_place_as_person(Subject, Object):
+    """
+    Ex:  Sophie Christiane of Wolfstein (24 October 24, 1667 - 23 August 1737)
+    Wolfstein is a *place*, not a *person*
+    """
+    anything = Star(Any())
+    person = Plus(Pos("NNP") + Question(Token(",")))
+    return anything + person + Token("of") + Subject + anything

+ 1 - 0
examples/credit/__init__.py

@@ -0,0 +1 @@
+from . import rules

+ 49 - 0
examples/credit/annotation.conf

@@ -0,0 +1,49 @@
+# -*- Mode: Text; tab-width: 8; indent-tabs-mode: nil; coding: utf-8; -*-
+# vim:set ft=conf ts=2 sw=2 sts=2 autoindent:
+
+# Simple text-based definitions of entity, relation and event types
+# and event attributes for the BioNLP Shared Task 2011 EPI task.
+
+
+[entities]
+
+Protein
+	abc
+Entity
+
+
+[relations]
+
+Equiv	Arg1:Protein, Arg2:Protein, <REL-TYPE>:symmetric-transitive
+Equiv	Arg1:Entity, Arg2:Entity, <REL-TYPE>:symmetric-transitive
+
+# (No entity nestings permitted in EPI. Could be defined using special
+# relation type ENTITY-NESTING if necessary.)
+
+
+[events]
+
+Catalysis	Theme:<EVENT>, Cause:Protein
+----------------------------------------
+DNA_methylation|GO:0006306	Theme:Protein, Site?:Entity
+DNA_demethylation|GO:0080111	Theme:Protein, Site?:Entity
+----------------------------------------
+Acetylation|GO:0006473	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Methylation|GO:0006479	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Glycosylation|GO:0006486	Theme:Protein, Site?:Entity, Sidechain?:Entity
+Hydroxylation|GO:0018126	Theme:Protein, Site?:Entity
+Phosphorylation|GO:0006468	Theme:Protein, Site?:Entity
+Ubiquitination|GO:0016567	Theme:Protein, Site?:Entity
+----------------------------------------
+Deacetylation|GO:0006476	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Demethylation|GO:0006482	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Deglycosylation|GO:0006517	Theme:Protein, Site?:Entity, Sidechain?:Entity
+Dehydroxylation|GO:-------	Theme:Protein, Site?:Entity
+Dephosphorylation|GO:0006470	Theme:Protein, Site?:Entity
+Deubiquitination|GO:0016579	Theme:Protein, Site?:Entity
+
+
+[attributes]
+
+Negation	Arg:<EVENT>
+Speculation	Arg:<EVENT>

Fichier diff supprimé car celui-ci est trop grand
+ 7 - 0
examples/credit/articles.csv


+ 28 - 0
examples/credit/bin/csv_to_iepy.py

@@ -0,0 +1,28 @@
+"""
+IEPY database loader from csv file
+
+Usage:
+    csv_to_iepy.py <filename>
+    csv_to_iepy.py -h | --help
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+corpus in two columns: 'freebase_mid' and 'description'.
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+
+import logging
+
+from docopt import docopt
+
+import iepy
+iepy.setup(__file__)
+from iepy.utils import csv_to_iepy
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+    opts = docopt(__doc__, version=iepy.__version__)
+    filepath = opts["<filename>"]
+    csv_to_iepy(filepath)

+ 76 - 0
examples/credit/bin/gazettes_loader.py

@@ -0,0 +1,76 @@
+"""
+IEPY gazettes loader
+
+Usage:
+    gazettes_loader.py <filename>
+
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+gazettes in two columns: 'literal' and 'class'.
+
+
+Options:
+  -h --help             Show this screen
+"""
+
+import sys
+import csv
+import gzip
+import logging
+from operator import itemgetter
+
+from django.db import IntegrityError
+from docopt import docopt
+
+import iepy
+iepy.setup(__file__)
+from iepy.data.models import EntityKind, GazetteItem
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+def add_gazettes_from_csv(filepath):
+    if filepath.endswith(".gz"):
+        fin = gzip.open(filepath, "rt")
+    else:
+        fin = open(filepath, "rt")
+    reader = csv.DictReader(fin)
+
+    expected_fnames = ['literal', 'class']
+    if not set(reader.fieldnames).issuperset(expected_fnames):
+        msg = "Couldn't find the expected field names on the provided csv: {}"
+        sys.exit(msg.format(expected_fnames))
+
+    _create_gazette_entries(
+        itemgetter(*expected_fnames)(line) for line in reader
+    )
+
+
+def _create_gazette_entries(entries_list):
+    kind_cache = {}
+    created = 0
+    for literal, kind_name in entries_list:
+        literal = literal.strip()
+        kind_name = kind_name.strip()
+        kind = kind_cache.get(kind_name)
+        if kind is None:
+            kind, _ = EntityKind.objects.get_or_create(name=kind_name)
+            kind_cache[kind_name] = kind
+        gazette = GazetteItem(text=literal, kind=kind)
+
+        try:
+            gazette.save()
+        except IntegrityError as error:
+            logging.warn(
+                "Gazette '{}' of class '{}' not loaded, literal already existed".format(
+                literal, kind_name))
+            print(error)
+        finally:
+            created += 1
+    print('Created {} new gazette items'.format(created))
+
+
+if __name__ == "__main__":
+    opts = docopt(__doc__, version=iepy.__version__)
+    fname = opts["<filename>"]
+    add_gazettes_from_csv(fname)

+ 59 - 0
examples/credit/bin/iepy_rules_runner.py

@@ -0,0 +1,59 @@
+"""
+Run IEPY rule-based extractor
+
+Usage:
+    iepy_rules_runner.py
+    iepy_rules_runner.py -h | --help | --version
+
+Picks from rules.py the relation to work with, and the rules definitions and
+proceeds with the extraction.
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+import sys
+import logging
+
+from django.core.exceptions import ObjectDoesNotExist
+
+import iepy
+iepy.setup(__file__)
+
+from iepy.extraction.rules import load_rules
+from iepy.extraction.rules_core import RuleBasedCore
+from iepy.data import models, output
+from iepy.data.db import CandidateEvidenceManager
+
+
+def run_from_command_line():
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+    try:
+        relation_name = iepy.instance.rules.RELATION
+    except AttributeError:
+        logging.error("RELATION not defined in rules file")
+        sys.exit(1)
+
+    try:
+        relation = models.Relation.objects.get(name=relation_name)
+    except ObjectDoesNotExist:
+        logging.error("Relation {!r} not found".format(relation_name))
+        sys.exit(1)
+
+    # Load rules
+    rules = load_rules()
+
+    # Load evidences
+    evidences = CandidateEvidenceManager.candidates_for_relation(relation)
+
+    # Run the pipeline
+    iextractor = RuleBasedCore(relation, rules)
+    iextractor.start()
+    iextractor.process()
+    predictions = iextractor.predict(evidences)
+    output.dump_output_loop(predictions)
+
+
+if __name__ == u'__main__':
+    run_from_command_line()

+ 184 - 0
examples/credit/bin/iepy_runner.py

@@ -0,0 +1,184 @@
+"""
+Run IEPY active-learning extractor
+
+Usage:
+    iepy_runner.py [options] <relation_name> <output>
+    iepy_runner.py [options] --db-store <relation_name>
+    iepy_runner.py -h | --help | --version
+
+Options:
+  --store-extractor=<extractor_output>     Stores the trained classifier
+  --trained-extractor=<extractor_path>     Load an already trained extractor
+  --db-store                               Stores the predictions on the database
+  --no-questions                           Won't generate questions to answer. Will predict
+                                           as is. Should be used with --trained-extractor
+  --tune-for=<tune-for>                    Predictions tuning. Options are high-prec
+                                           or high-recall [default: high-prec]
+  --extractor-config=<config.json>         Sets the extractor config
+  --version                                Version number
+  -h --help                                Show this screen
+"""
+
+import os
+import json
+import logging
+from docopt import docopt
+from sys import exit
+
+import iepy
+INSTANCE_PATH = iepy.setup(__file__)
+
+from iepy.extraction.active_learning_core import ActiveLearningCore, HIPREC, HIREC
+from iepy.data.db import CandidateEvidenceManager
+from iepy.data.models import Relation
+from iepy.extraction.terminal import TerminalAdministration
+from iepy.data import output
+
+
+def print_all_relations():
+    print("All available relations:")
+    for relation in Relation.objects.all():
+        print("  {}".format(relation))
+
+
+def load_labeled_evidences(relation, evidences):
+    CEM = CandidateEvidenceManager  # shorcut
+    return CEM.labels_for(relation, evidences, CEM.conflict_resolution_newest_wins)
+
+
+def _get_tuning_mode(opts):
+    if opts['--tune-for'] == 'high-prec':
+        tuning_mode = HIPREC
+    elif opts['--tune-for'] == 'high-recall':
+        tuning_mode = HIREC
+    else:
+        print ('Invalid tuning mode')
+        print (__doc__)
+        exit(1)
+    return tuning_mode
+
+
+def _get_relation(opts):
+    relation_name = opts['<relation_name>']
+    try:
+        relation = Relation.objects.get(name=relation_name)
+    except Relation.DoesNotExist:
+        print("Relation {!r} non existent".format(relation_name))
+        print_all_relations()
+        exit(1)
+    return relation
+
+
+def _load_extractor(opts, relation, labeled_evidences):
+    extractor_path = opts.get('--trained-extractor')
+    try:
+        iextractor = ActiveLearningCore.load(extractor_path,
+                                             labeled_evidences=labeled_evidences)
+    except ValueError:
+        print("Error: unable to load extractor, invalid file")
+        exit(1)
+
+    if iextractor.relation != relation:
+        print('The loaded extractor is not for the requested relation'
+              ' but for relation {} instead'.format(iextractor.relation))
+        exit(1)
+    print('Extractor successfully loaded')
+    return iextractor
+
+
+def _construct_extractor(opts, relation, labeled_evidences, tuning_mode):
+    config_filepath = opts.get("--extractor-config")
+    if not config_filepath:
+        config_filepath = os.path.join(INSTANCE_PATH, "extractor_config.json")
+
+    if not os.path.exists(config_filepath):
+        print("Error: extractor config does not exists, please create the "
+              "file extractor_config.json or use the --extractor-config")
+        exit(1)
+
+    with open(config_filepath) as filehandler:
+        try:
+            extractor_config = json.load(filehandler)
+        except Exception as error:
+            print("Error: unable to load extractor config: {}".format(error))
+            exit(1)
+
+    iextractor = ActiveLearningCore(
+        relation, labeled_evidences, extractor_config, tradeoff=tuning_mode
+    )
+    return iextractor
+
+
+def run_from_command_line():
+    opts = docopt(__doc__, version=iepy.__version__)
+
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+    logging.getLogger("featureforge").setLevel(logging.WARN)
+
+    tuning_mode = _get_tuning_mode(opts)
+    relation = _get_relation(opts)
+
+    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
+    labeled_evidences = load_labeled_evidences(relation, candidates)
+
+    if opts.get('--trained-extractor'):
+        iextractor = _load_extractor(opts, relation, labeled_evidences)
+        was_ever_trained = True
+        opts["--no-questions"] = True
+    else:
+        iextractor = _construct_extractor(opts, relation, labeled_evidences, tuning_mode)
+        iextractor.start()
+        was_ever_trained = False
+
+    if not opts.get("--no-questions", False):
+        questions_loop(iextractor, relation, was_ever_trained)
+
+    # Candidates generator was consumed when generating labeled_evidences, so we'll
+    # define it fresh again
+    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
+    # Predict and store output
+    predictions = iextractor.predict(candidates)  # asking predictions for EVERYTHING
+    if not predictions:
+        print("Nothing was predicted")
+        exit(1)
+
+    if opts.get("--db-store"):
+        output.dump_predictions_to_database(relation, predictions)
+
+    output_file = opts.get("<output>")
+    if output_file:
+        output.dump_runner_output_to_csv(predictions, output_file)
+
+    classifier_output = opts.get("--store-extractor")
+    if classifier_output:
+        iextractor.save(classifier_output)
+
+
+def questions_loop(iextractor, relation, was_ever_trained):
+    STOP = u'STOP'
+    term = TerminalAdministration(
+        relation,
+        extra_options=[(STOP, u'Stop execution')]
+    )
+    while iextractor.questions:
+        questions = list(iextractor.questions)  # copying the list
+        term.update_candidate_evidences_to_label(questions)
+        result = term()
+        i = 0
+        for c, label_value in load_labeled_evidences(relation, questions).items():
+            if label_value is not None:
+                iextractor.add_answer(c, label_value)
+                i += 1
+        print ('Added %s new human labels to the extractor core' % i)
+        iextractor.process()
+        was_ever_trained = True
+        if result == STOP:
+            break
+
+    if not was_ever_trained:
+        # It's needed to run some process before asking for predictions
+        iextractor.process()
+
+
+if __name__ == u'__main__':
+    run_from_command_line()

+ 12 - 0
examples/credit/bin/manage.py

@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+
import sys

from django.core.management import execute_from_command_line

import iepy
# NOTE(review): iepy.setup(__file__) appears to locate the IEPY instance
# settings relative to this file's path; it must run before any Django
# management command is dispatched — confirm against iepy's docs.
iepy.setup(__file__)


if __name__ == "__main__":
    # Delegate to Django's standard management commands (migrate, shell, ...).
    execute_from_command_line(sys.argv)

+ 96 - 0
examples/credit/bin/preprocess.py

@@ -0,0 +1,96 @@
+"""
+Corpus preprocessing script
+
+Usage:
+    preprocess.py [options]
+    preprocess.py --split-in=<num-splits> --run-part=<num-part>
+    preprocess.py --increment-ner
+    preprocess.py -h | --help | --version
+
+Options:
+  -h --help                      Show this screen
+  --multiple-cores=<num-cores>   Number of cores (use all to use every processor)
+  --increment-ner                Re run NER and Gazetter for every document. If a document lacked any of the previous steps, will be preprocessed entirely.
+  --version                      Version number
+"""
+import logging
+
+from docopt import docopt
+
+import os
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+import iepy
+import multiprocessing
+iepy.setup(__file__)
+from iepy.data.db import DocumentManager
+from iepy.selfpreprocess.self_preprocess import SelfPreprocesser
+from iepy.selfpreprocess.pipeline import PreProcessPipeline, PreProcessSteps
+# from iepy.preprocess.stanford_preprocess import StanfordPreprocess
+# from iepy.preprocess.pipeline import PreProcessPipeline, PreProcessSteps
+# from iepy.preprocess.segmenter import SyntacticSegmenterRunner
+
+
+
+
class ParallelDocManager(DocumentManager):
    """DocumentManager that can split a queryset into disjoint shards by id,
    so several workers can preprocess different documents in parallel."""

    def mines_of(self, qset, number_of_processors, my_id):
        """Return the subset of ``qset`` assigned to worker ``my_id``.

        Sharding uses a raw SQL ``id % K = N`` clause.  The ``%%%%`` escaping
        is intentional: Python's ``%`` formatting collapses it to ``%%``, and
        the database driver's parameter escaping reduces that to the single
        literal ``%`` (modulo) when the query executes.
        """
        K = number_of_processors
        N = my_id
        clause = 'id %%%% %s = %s' % (K, N)
        return qset.extra(where=[clause])
+
def start_preprocess(docs, increment_ner):
    """Run the preprocessing pipeline over the given documents."""
    steps = [
        SelfPreprocesser(increment_ner),
        # SyntacticSegmenterRunner(increment=True)
    ]
    PreProcessPipeline(steps, docs).process_everything()
+
if __name__ == '__main__':
    # CLI entry point: preprocess all documents lacking the brat step,
    # optionally sharded over several processes or run as one shard.
    logger = logging.getLogger(u'preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    opts = docopt(__doc__, version=iepy.__version__)
    increment_ner = opts['--increment-ner']

    dm = ParallelDocManager()
    all_docs = dm.get_documents_lacking_preprocess(
        [PreProcessSteps.brat])

    multiple_cores = opts.get('--multiple-cores')
    split_in = opts.get("--split-in")
    run_part = opts.get("--run-part")

    if multiple_cores:
        # Fan out over N worker processes; worker i handles the shard of
        # documents with id % N == i.
        if multiple_cores == "all":
            multiple_cores = multiprocessing.cpu_count()
        try:
            multiple_cores = int(multiple_cores)
        except ValueError:
            logger.error("Invalid number of cores")
            exit(1)

        for i in range(multiple_cores):
            process = multiprocessing.Process(
                target=start_preprocess,
                args=(dm.mines_of(all_docs, multiple_cores, i), increment_ner)
            )
            process.start()
    elif split_in:
        # Run a single shard: 1-based on the command line, 0-based internally.
        try:
            split_in = int(split_in)
            run_part = int(run_part) - 1
        except (ValueError, TypeError):
            logger.error("Invalid split")
            exit(1)

        # BUG FIX: valid 0-based parts are 0 .. split_in-1.  The original
        # checked ``run_part > split_in`` and therefore silently accepted
        # ``--run-part == split_in + 1`` (an empty, out-of-range shard).
        if run_part < 0 or run_part >= split_in:
            logger.error("Parts must be between 1 and {}".format(split_in))
            exit(1)

        docs = dm.mines_of(all_docs, split_in, run_part)
        start_preprocess(docs, increment_ner)
    else:
        # No sharding requested: process everything in this process.
        start_preprocess(all_docs, increment_ner)

+ 149 - 0
examples/credit/bin/rules_verifier.py

@@ -0,0 +1,149 @@
+"""
+IEPY rules verifier
+
+
+Usage:
+    rules_verifier.py <relation> [options]
+
+Options:
+  --shuffle             Chooses the sample randomly and not the first ones
+  --create-evidences    Creates evidences that are missing [default: false]
+  -r --rule=<rule>      Tests only this rule
+  -l --limit=<limit>    Limits the amount of evidences uses
+  -h --help             Show this screen
+"""
+
+import sys
+import logging
+from docopt import docopt
+
+import refo
+from django.core.exceptions import ObjectDoesNotExist
+from colorama import init as colorama_init
+
+import iepy
+iepy.setup(__file__)
+
+from iepy.data import models
+from iepy.data.models import EvidenceCandidate
+from iepy.data.db import CandidateEvidenceManager
+from iepy.extraction.terminal import TerminalEvidenceFormatter
+from iepy.extraction.rules import (
+    load_rules, compile_rule, generate_tokens_to_match
+)
+from iepy.metrics import result_dict_from_predictions
+
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
def run_from_command_line():
    """Parse CLI options, load the relation, its rules and the candidate
    evidences, then run every rule against every evidence."""
    opts = docopt(__doc__, version=iepy.__version__)
    relation_name = opts.get("<relation>")
    raw_limit = opts.get("--limit")
    rule_name = opts.get("--rule")
    shuffle = opts.get("--shuffle")
    create_evidences = opts.get("--create-evidences")

    # A missing --limit means "no limit", expressed as -1 downstream.
    try:
        limit = -1 if raw_limit is None else int(raw_limit)
    except ValueError:
        logging.error("Invalid limit value, it must be a number")
        sys.exit(1)

    try:
        relation = models.Relation.objects.get(name=relation_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(relation_name))
        sys.exit(1)

    # Compile each rule into a (name, regex, expected answer) triple.
    rule_regexes = [
        (rule.__name__, compile_rule(rule, relation), rule.answer)
        for rule in get_rules(rule_name)
    ]

    # If no candidate evidence exists at all, force its creation.
    if EvidenceCandidate.objects.all().count() == 0:
        create_evidences = True
    evidences = CandidateEvidenceManager.candidates_for_relation(
        relation, create_evidences, seg_limit=limit, shuffle_segs=shuffle
    )
    answers = CandidateEvidenceManager.labels_for(
        relation, evidences,
        CandidateEvidenceManager.conflict_resolution_newest_wins
    )
    run_tests(rule_regexes, evidences, answers)
+
+
def run_tests(rule_regexes, evidences, answers):
    """Print the evidences matched by each rule and, when human labels are
    available, the resulting classification metrics."""
    predictions = []
    real_labels = []
    evidences_with_labels = []

    colorama_init()
    formatter = TerminalEvidenceFormatter()

    for name, regex, answer in rule_regexes:
        title = "Matches for rule '{}' (value: {})".format(name, answer)
        print("\n{}\n{}".format(title, "-" * len(title)))

        matched_something = False
        for evidence in evidences:
            match = refo.match(regex, generate_tokens_to_match(evidence))
            if match:
                matched_something = True
                print("  * {}".format(formatter.colored_text(evidence)))

            # Only human-labeled evidences count towards the metrics.
            label = answers.get(evidence)
            if label is not None:
                evidences_with_labels.append(evidence)
                real_labels.append(label)
                # A match predicts the rule's answer; no match predicts False.
                predictions.append(answer if match else False)

        if not matched_something:
            print("  nothing matched")

        print()

    if real_labels:
        results = result_dict_from_predictions(
            evidences_with_labels, real_labels, predictions
        )
        results.pop("end_time")

        title = "Metrics"
        print("{}\n{}".format(title, "-" * len(title)))
        for key in ("true_positives", "true_negatives",
                    "false_positives", "false_negatives",
                    "precision", "recall",
                    "accuracy", "f1"):
            print("{:>15}: {:.2f}".format(key, results[key]))
+
+
def get_rules(rule_name):
    """Load every defined rule; if ``rule_name`` is given, keep only that
    rule and abort with an error when no rule has that name.

    :param rule_name: optional name of a single rule to test
    :return: list of rule objects (never empty; exits on failure)
    """
    rules = load_rules()

    if rule_name:
        rules = [rule for rule in rules if rule.__name__ == rule_name]
        if not rules:
            # BUG FIX: grammar in the user-facing message
            # ("does not exists" -> "does not exist").
            logging.error("rule '{}' does not exist".format(rule_name))
            sys.exit(1)

    return rules
+
+
+if __name__ == "__main__":
+    run_from_command_line()

+ 241 - 0
examples/credit/bin/settlement.py

@@ -0,0 +1,241 @@
+
+
+
+
+from django.db.models import Q
+import datetime,time
+import iepy
+iepy.setup(__file__)
+from iepy.data.db import DocumentManager
+from iepy.data.models import IEDocument,LabeledIEDocument,IEDocumentMetadata,LabeledIEDocumentMetadata,Payroll
+from brat.models import BratAnnotation,LabeledBratAnnotation
+from django.db import transaction
+import pandas as pd
+from django.contrib.auth.models import User
+
def object_to_dict(obj, class_model):
    """Build a dict mapping each local field name of ``class_model`` to the
    corresponding value on ``obj``.

    :param obj: a model instance
    :param class_model: the Django model class describing the fields
    :return: dict of field name -> field value
    (Docstring translated from the original Chinese.)
    """
    concrete_model = class_model._meta.concrete_model
    return {
        field.name: field.value_from_object(obj)
        for field in concrete_model._meta.local_fields
    }
+
+
class Settlement():

    '''
    Settlement helper: groups the operations a payroll administrator runs to
    compute, export and manage annotators' labeling statistics.
    (All docstrings translated from the original Chinese.)

    NOTE(review): every raw SQL statement below is built with string
    interpolation, so the arguments must come from trusted callers only.
    '''

    def makePayroll(self,_user,time_begin,time_end):
        '''
        Compute one user's labeling stats over a period and upsert a Payroll row.

        :param _user: username
        :param time_begin: period start (yyyy-mm-dd)
        :param time_end: period end (yyyy-mm-dd)
        :return: None; creates or updates the user's Payroll record
        '''
        from django.db import connection
        with transaction.atomic():
            cursor = connection.cursor()
            # Number of documents the user edited inside the period.
            sql = " select count(1) from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s'"%(_user,time_end,time_begin)
            cursor.execute(sql)
            doc_count = cursor.fetchall()[0][0]
            # Entity ("T...") annotations within the user's newest 1200 documents.
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"T%")
            cursor.execute(sql)
            t_count = cursor.fetchall()[0][0]
            # Relation ("R...") annotations within the newest 1200 documents.
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"R%")
            cursor.execute(sql)
            r_count = cursor.fetchall()[0][0]
            # Entity annotations over ALL the user's documents in the period.
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s') and value like '%s' "%(_user,time_end,time_begin,"T%")
            cursor.execute(sql)
            all_t_count = cursor.fetchall()[0][0]
            # Relation annotations over ALL the user's documents in the period.
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s') and value like '%s' "%(_user,time_end,time_begin,"R%")
            cursor.execute(sql)
            all_r_count = cursor.fetchall()[0][0]
            # Piece rates: annotations inside the newest 1200 documents pay
            # 0.03 (entity) / 0.05 (relation); the rest pay 0.04 / 0.06.
            # TODO(review): confirm these magic rates and the 1200 cutoff.
            wage = round(0.03*t_count+0.05*r_count+(all_t_count-t_count)*0.04+(all_r_count-r_count)*0.06,2)
            print(doc_count,t_count,r_count,wage)
            # Upsert: keep a single Payroll row per (user, period).
            payrolls = Payroll.objects.filter(Q(user=_user)& Q(begin_time=time_begin) & Q(end_time=time_end))
            if len(payrolls)==0:
                _payroll = Payroll.objects.create(**{"user":_user,"doc_count":doc_count,"begin_time":time_begin,"end_time":time_end,"t_count":all_t_count,"r_count":all_r_count,"wage":wage})
                _payroll.save()
            else:
                _payroll = payrolls[0]
                _payroll.doc_count = doc_count
                _payroll.t_count = all_t_count
                _payroll.r_count = all_r_count
                _payroll.wage = wage
                _payroll.save()

    def exportPayroll(self,begin_time,end_time):
        '''
        Export the Payroll rows of a period to an Excel spreadsheet.

        :param begin_time: export start date, or None to match on end date only
        :param end_time: export end date
        :return: None; writes an .xls file into the working directory
        '''
        list_user = []
        list_doc_count = []
        list_t_count = []
        list_r_count = []
        list_wage = []
        list_yield = []
        list_account = []
        list_begin = []
        list_end = []
        if begin_time is not None:
            payrolls = Payroll.objects.filter(Q(begin_time=begin_time) & Q(end_time=end_time))
        else:
            payrolls = Payroll.objects.filter(Q(end_time=end_time))
        for _payroll in payrolls:
            list_user.append(_payroll.user)
            list_doc_count.append(_payroll.doc_count)
            list_t_count.append(_payroll.t_count)
            list_r_count.append(_payroll.r_count)
            list_wage.append(_payroll.wage)
            list_yield.append(_payroll._yield)
            list_account.append(_payroll.account)
            list_begin.append(_payroll.begin_time)
            list_end.append(_payroll.end_time)
        # Column headers are Chinese display labels (user, begin, end, docs,
        # entities, relations, total, pass rate, settled price) and must stay
        # as-is: they are part of the produced spreadsheet.
        df = pd.DataFrame({"用户":list_user,"开始时间":list_begin,"结束时间":list_end,"文章数":list_doc_count,"要素数":list_t_count,"关系数":list_r_count,"总价":list_wage,"合格率":list_yield,"结算价":list_account})
        df.to_excel("%s-%s要素标注统计.xls"%(begin_time,end_time),columns=["用户","开始时间","结束时间","文章数","要素数","关系数","总价","合格率","结算价"])

    def getAllUser(self):
        '''Return the usernames of every staff user (auth_user.is_staff).'''
        from django.db import connection
        with transaction.atomic():
            list_user = []
            cursor = connection.cursor()
            sql = "select username from auth_user where is_staff='t'"
            cursor.execute(sql)
            for row in cursor.fetchall():
                list_user.append(row[0])
            return list_user


    def makeMigrate(self,_user,time_begin,time_end):
        '''
        Migrate the user's labeled data of the period into the "labeled"
        standard tables.

        :param _user: username
        :param time_begin: period start
        :param time_end: period end

        NOTE(review): currently a no-op -- the whole body is commented out
        (it contains both an offsets-repair pass and the actual migration).
        '''
        pass
        # from django.db import connection
        # with transaction.atomic():
        #     cursor = connection.cursor()
        #     sql = " select human_identifier,offsets_to_text,sentences from corpus_iedocument where edituser is null"
        #     cursor.execute(sql)
        #     cursor1 = connection.cursor()
        #     _index = 0
        #     rows = True
        #     while(rows):
        #         rows=cursor.fetchmany(1000)
        #         for row in rows:
        #             _index += 1
        #             print(_index)
        #             human_identifier,offsets_to_text,sentences = row
        #             if sentences!="[]":
        #                 _off = offsets_to_text.split(", ")[-1][:-1]
        #                 _sen = sentences.split(", ")[-1][:-1]
        #                 print(_off,_sen)
        #                 if int(_off)!=int(_sen):
        #                     offsets_to_text = offsets_to_text[:-1]+", "+str(int(_sen))+"]"
        #                     print(offsets_to_text)
        #                     cursor1.execute("update corpus_iedocument set offsets_to_text='%s' where human_identifier='%s'"%(offsets_to_text,human_identifier))



            # ieDocuments = IEDocument.objects.filter(Q(edituser=_user) & Q(edittime__range=(time_begin,time_end)))
            # for obj in ieDocuments:
            #     _dict = object_to_dict(obj,IEDocument)
            #     _dict_meta = object_to_dict(obj.metadata,IEDocumentMetadata)
            #     labeledMeta = LabeledIEDocumentMetadata.objects.create(**_dict_meta)
            #     labeledMeta.save()
            #     _dict["metadata"] = labeledMeta
            #     tmp = LabeledIEDocument.objects.create(**_dict)
            #     tmp.save()
            #
            #     bratAnnotations = BratAnnotation.objects.filter(Q(document_id=obj.human_identifier))
            #     for ann in bratAnnotations:
            #         _dict_ann = object_to_dict(ann,BratAnnotation)
            #         labeledAnn = LabeledBratAnnotation.objects.create(**_dict_ann)
            #         labeledAnn.save()


    def getPercentOfPass(self,_user,time_begin,time_end):
        '''
        Return the pass rate of the user's annotations within the period.

        :param _user: username
        :param time_begin: period start
        :param time_end: period end

        NOTE(review): not implemented -- the body is only this docstring,
        so the method always returns None.
        '''

    def makePayrolls(self,time_begin,time_end):
        '''
        Build payroll rows for every staff user over the period, then export
        them all to a spreadsheet.

        :param time_begin: period start
        :param time_end: period end
        '''
        for _user in self.getAllUser():
            self.makePayroll(_user,time_begin,time_end)
        self.exportPayroll(time_begin,time_end)

    def createUser_batch(self,batch_size=90):
        '''
        Create ``batch_size`` annotator accounts named bidi1..bidiN, each
        with the password equal to its username.

        :param batch_size: number of users to create
        NOTE(review): passwords equal usernames -- acceptable only for
        throwaway annotation accounts.
        '''
        list_user = [User.objects.create_user(username="bidi%d"%(i+1),password="bidi%d"%(i+1)) for i in range(batch_size)]

    def exportLabels(self):
        '''Export, for each hard-coded group of bidiN users, which documents
        each user labeled and when, as one Excel file per group.'''
        # Inclusive 1-based ranges of bidi user indexes per group.
        groups = [[1,7],[8,14],[15,22],[23,29],[30,36],[37,43],[44,50],[51,56],[57,62],[63,71]]
        from django.db import connection
        cursor = connection.cursor()
        for _i in range(len(groups)):
            _begin,_end = groups[_i]
            list_username = []
            list_user = []
            list_label = []
            list_time = []
            for _j in range(_begin,_end+1):
                username = "bidi%d"%_j
                list_username.append("'%s'"%username)
            sql = " select edituser,human_identifier,to_char(edittime,'yyyy-mm-dd') from corpus_iedocument where edituser in(%s) order by edittime asc"%(",".join(list_username))
            print(sql)
            cursor.execute(sql)
            rows = cursor.fetchall()
            for row in rows:
                list_user.append(row[0])
                list_label.append(row[1])
                list_time.append(row[2])
            # Chinese column headers (time, user, document id) are part of
            # the produced spreadsheet and must stay as-is.
            df = pd.DataFrame({"时间":list_time,"用户":list_user,"文章编号":list_label})
            df.to_excel("分组_%d.xls"%(_i+1),columns=["时间","用户","文章编号"])

    def filter(self):
        '''
        Flag auction notices: set jump_signal=1 on every document whose text
        contains "拍卖" (auction).
        NOTE(review): method name shadows the builtin ``filter``; scans the
        whole IEDocument table.
        '''
        import re
        ieDocuments = IEDocument.objects.all()
        for obj in ieDocuments:
            if re.search("拍卖",obj.text) is not None:
                obj.jump_signal = 1
                obj.save()
                print(obj.human_identifier)
+
+
+
+if __name__=="__main__":
+    settle = Settlement()
+    # settle.makeMigrate("test","2020-08-01","2020-08-31")
+    # settle.makePayroll("test17","2020-08-01","2020-10-31")
+    # settle.makePayrolls("2020-08-01","2020-08-31")
+    settle.exportPayroll(begin_time=None,end_time='2020-10-31')
+    # settle.createUser_batch(batch_size=102)
+    # settle.exportLabels()
+    # settle.filter()

+ 20 - 0
examples/credit/extractor_config.json

@@ -0,0 +1,20 @@
+{
+    "sparse_features": [
+        "bag_of_words",
+        "bag_of_pos",
+        "bag_of_words_in_between",
+        "bag_of_pos_in_between"
+    ],
+    "dense_features": [
+        "entity_order",
+        "entity_distance",
+        "other_entities_in_between",
+        "verbs_count_in_between",
+        "verbs_count",
+        "total_number_of_entities",
+        "symbols_in_between",
+        "number_of_tokens"
+    ],
+    "classifier_args": {},
+    "classifier": "svc"
+}

+ 6 - 0
examples/credit/format.py

@@ -0,0 +1,6 @@
import time

# Scratch script: print "now minus 4 days" as an epoch timestamp, then the
# date of each epoch in ``a``.
a = [1462636800, 1606492800]
print(time.time() - 86400 * 4)

for item in a:
    # BUG FIX: the original formatted the hard-coded epoch 1606377029 on
    # every iteration instead of the loop variable ``item``.
    print(time.strftime('%Y-%m-%d', time.localtime(item)))

+ 2 - 0
examples/credit/rules.py

@@ -0,0 +1,2 @@
+# Write here your rules
+# RELATION = 'your relation here'

+ 182 - 0
examples/credit/settings.py

@@ -0,0 +1,182 @@
+"""
+For more information on this file, see
+https://docs.djangoproject.com/en/1.7/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/1.7/ref/settings/
+"""
+
+from iepy.webui.webui.settings import *
+from django.conf import settings
+
IEPY_VERSION = '0.9.6'
IEPY_LANG = 'en'
# NOTE(review): the secret key and the DB credentials below are hardcoded
# and committed to the repository; move them to environment variables
# before any real deployment.
SECRET_KEY = 'u==!fueit=wxo&j8!5u+sfasp4prjluk@*s=7!-wz_&r@pn))r'
# Development-only flags: never run production with DEBUG enabled.
DEBUG = True
TEMPLATE_DEBUG = True

# Database
# https://docs.djangoproject.com/en/1.7/ref/settings/#databases
# DATABASES = {
#     'default': {
#         'ENGINE': 'django.db.backends.sqlite3',
#         'NAME': '/home/python/luojiehua/dl_nlp/iepy-develop/examples/test/test.sqlite',
#     }
# }
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.postgresql_psycopg2',
        'NAME': 'iepy_credit',
        'USER': 'postgres',
        'PASSWORD': 'postgres',
        'HOST': '192.168.2.101',
        'PORT': '5432'
    }
}
+
+# For changing tokenization options, read here.
+# http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/process/PTBTokenizer.html
+# You can use as key any of the "known options" listed on that page, and as value,
+# use True or False (python names) for booleans, or strings when option requires a text
+# CORENLP_TKN_OPTS = {
+#     'latexQuotes': False
+# }
+
# Default brat annotation-tool configuration, keyed by brat config filename.
# Each value is the verbatim content of that file (entity/relation inventory
# for the credit/bidding domain; visual.conf maps ids to Chinese labels).
CONFIG_BRAT = {
    # Access-control rules served to brat.
    "acl.conf":"""
User-agent: *
Allow: /
Disallow: /hidden/

User-agent: guest
Disallow: /confidential/
""",
    # Entity spans, relations, events and attributes available for labeling.
    "annotation.conf":"""
[spans]
punishment_code
code
name
money
	money_tendereeMoney
	money_tendererMoney
org
	org_tenderee
	org_agency
	org_tenderer
	org_secondTenderer
	org_thirdTenderer
company
	company_tenderee
	company_agency
	company_tenderer
	company_secondTenderer
	company_thirdTenderer
job
person
	person_tendereePerson
	person_agencyPerson
	person_person
	person_review
time
	time_release
	time_bidopen
	time_bidclose
location
package
phone
moneysource
bidway
serviceTime
[relations]
Equiv	Arg1:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, <REL-TYPE>:symmetric-transitive
rel_tendererMoney	Arg1:org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderer|company_secondTenderer|company_thirdTenderer|org|company, Arg2:money_tendererMoney
rel_tendereeMoney	Arg1:package, Arg2:money_tendereeMoney|money
rel_person	Arg1:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:person_tendereePerson|person_agencyPerson|person_person
rel_pack	Arg1:org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:package
rel_address	Arg1:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:location
rel_phone	Arg1:person_tendereePerson|person_agencyPerson|person_person, Arg2:phone
rel_pack_code	Arg1:package, Arg2:code
rel_pack_name	Arg1:package, Arg2:name


[events]
#Protein_binding|GO:0005515	Theme+:Protein
#Gene_expression|GO:0010467	Theme:Protein

[attributes]
#att_role	Arg:<ENTITY>, Value:招标人|代理人|中标人|第二候选人|第三候选人|att_noRole
#att_role	Arg:<ENTITY>, Value:att_tenderee|att_agency|att_tenderer|att_secondTenderer|att_thirdTenderer|att_noRole
#att_money	Arg:<ENTITY>, Value:att_tendereeMoney|att_tendererMoney|att_nomoney
#att_person	Arg:<ENTITY>, Value:att_noperson|att_tendereePerson|att_agencyPerson|att_person
#Negation	Arg:<EVENT>
#Speculation	Arg:<EVENT>
""",
    # Display labels (Chinese) and drawing defaults for the brat UI.
    "visual.conf":"""
[labels]
punishment_code | 处罚编号
code | 项目编号
name | 项目名称
org | 组织
company | 公司
job | 职业
person | 人名
time | 时间
location | 地址
package | 包号
phone | 电话
money | 金额
money_tendereeMoney | 招标金额
money_tendererMoney | 中投标金额

org_tenderee | 招标人
org_agency | 代理人
org_tenderer | 中标人
org_secondTenderer | 第二候选人
org_thirdTenderer | 第三候选人
company_tenderee | 招标人
company_agency | 代理人
company_tenderer | 中标人
company_secondTenderer | 第二候选人
company_thirdTenderer | 第三候选人

person_tendereePerson | 招标联系人
person_agencyPerson | 代理联系人
person_person | 联系人

rel_tendererMoney | 中投标金额
rel_tendereeMoney | 招标金额
rel_person | 联系人
rel_pack | 所属包
rel_address | 地址
rel_phone | 联系电话
rel_pack_code | 包件编号
rel_pack_name | 包件名称

person_review | 评审专家
time_release | 发布时间
time_bidopen | 开标时间
time_bidclose | 截标时间
moneysource | 资金来源
bidway | 招标方式
serviceTime | 服务期限

#Protein | Protein | Pro | P
#Protein_binding | Protein binding | Binding | Bind
#Gene_expression | Gene expression | Expression | Exp
#Theme | Theme | Th

[drawing]
Protein	bgColor:#7fa2ff
SPAN_DEFAULT	fgColor:black, bgColor:lightgreen, borderColor:black
ARC_DEFAULT	color:black
ATTRIBUTE_DEFAULT	glyph:*
""",
    # External search tools offered by the brat UI.
    "tools.conf":"""
[search]
google     <URL>:http://www.google.com/search?q=%s
""",
    # Keyboard shortcuts for fast labeling.
    "kb_shortcuts.conf":"""
P	Protein
"""
}

BIN
examples/credit/test.sqlite


+ 1 - 0
examples/product/__init__.py

@@ -0,0 +1 @@
+from . import rules

+ 49 - 0
examples/product/annotation.conf

@@ -0,0 +1,49 @@
+# -*- Mode: Text; tab-width: 8; indent-tabs-mode: nil; coding: utf-8; -*-
+# vim:set ft=conf ts=2 sw=2 sts=2 autoindent:
+
+# Simple text-based definitions of entity, relation and event types
+# and event attributes for the BioNLP Shared Task 2011 EPI task.
+
+
+[entities]
+
+Protein
+	abc
+Entity
+
+
+[relations]
+
+Equiv	Arg1:Protein, Arg2:Protein, <REL-TYPE>:symmetric-transitive
+Equiv	Arg1:Entity, Arg2:Entity, <REL-TYPE>:symmetric-transitive
+
+# (No entity nestings permitted in EPI. Could be defined using special
+# relation type ENTITY-NESTING if necessary.)
+
+
+[events]
+
+Catalysis	Theme:<EVENT>, Cause:Protein
+----------------------------------------
+DNA_methylation|GO:0006306	Theme:Protein, Site?:Entity
+DNA_demethylation|GO:0080111	Theme:Protein, Site?:Entity
+----------------------------------------
+Acetylation|GO:0006473	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Methylation|GO:0006479	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Glycosylation|GO:0006486	Theme:Protein, Site?:Entity, Sidechain?:Entity
+Hydroxylation|GO:0018126	Theme:Protein, Site?:Entity
+Phosphorylation|GO:0006468	Theme:Protein, Site?:Entity
+Ubiquitination|GO:0016567	Theme:Protein, Site?:Entity
+----------------------------------------
+Deacetylation|GO:0006476	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Demethylation|GO:0006482	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Deglycosylation|GO:0006517	Theme:Protein, Site?:Entity, Sidechain?:Entity
+Dehydroxylation|GO:-------	Theme:Protein, Site?:Entity
+Dephosphorylation|GO:0006470	Theme:Protein, Site?:Entity
+Deubiquitination|GO:0016579	Theme:Protein, Site?:Entity
+
+
+[attributes]
+
+Negation	Arg:<EVENT>
+Speculation	Arg:<EVENT>

Fichier diff supprimé car celui-ci est trop grand
+ 7 - 0
examples/product/articles.csv


+ 28 - 0
examples/product/bin/csv_to_iepy.py

@@ -0,0 +1,28 @@
+"""
+IEPY database loader from csv file
+
+Usage:
+    csv_to_iepy.py <filename>
+    csv_to_iepy.py -h | --help
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+corpus in two columns: 'freebase_mid' and 'description'.
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+
+import logging
+
+from docopt import docopt
+
+import iepy
+iepy.setup(__file__)
+from iepy.utils import csv_to_iepy
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+    opts = docopt(__doc__, version=iepy.__version__)
+    filepath = opts["<filename>"]
+    csv_to_iepy(filepath)

+ 76 - 0
examples/product/bin/gazettes_loader.py

@@ -0,0 +1,76 @@
+"""
+IEPY gazettes loader
+
+Usage:
+    gazettes_loader.py <filename>
+
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+gazettes in two columns: 'literal' and 'class'.
+
+
+Options:
+  -h --help             Show this screen
+"""
+
+import sys
+import csv
+import gzip
+import logging
+from operator import itemgetter
+
+from django.db import IntegrityError
+from docopt import docopt
+
+import iepy
+iepy.setup(__file__)
+from iepy.data.models import EntityKind, GazetteItem
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
def add_gazettes_from_csv(filepath):
    """Load gazette items from a csv (optionally gzipped) file that has at
    least the columns 'literal' and 'class', and store them in the database.

    :param filepath: path to a .csv or .csv.gz file
    Exits the process if the expected columns are missing.
    """
    opener = gzip.open if filepath.endswith(".gz") else open
    # BUG FIX: the original never closed the file handle.  The context
    # manager both closes it and guarantees the file stays open while the
    # generator below is being consumed.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)

        expected_fnames = ['literal', 'class']
        if not set(reader.fieldnames).issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv: {}"
            sys.exit(msg.format(expected_fnames))

        _create_gazette_entries(
            itemgetter(*expected_fnames)(line) for line in reader
        )
+
+
def _create_gazette_entries(entries_list):
    """Persist (literal, kind_name) pairs as GazetteItem rows.

    Entity kinds are get_or_create'd once and cached.  Literals that already
    exist (IntegrityError) are reported and skipped.

    :param entries_list: iterable of (literal, kind_name) string pairs
    """
    kind_cache = {}
    created = 0
    for literal, kind_name in entries_list:
        literal = literal.strip()
        kind_name = kind_name.strip()
        kind = kind_cache.get(kind_name)
        if kind is None:
            kind, _ = EntityKind.objects.get_or_create(name=kind_name)
            kind_cache[kind_name] = kind
        gazette = GazetteItem(text=literal, kind=kind)

        try:
            gazette.save()
        except IntegrityError as error:
            # logging.warn is a deprecated alias; warning() is the real API.
            logging.warning(
                "Gazette '{}' of class '{}' not loaded, literal already existed".format(
                literal, kind_name))
            print(error)
        else:
            # BUG FIX: the original used ``finally`` here, so rows rejected
            # with IntegrityError were still counted as created.
            created += 1
    print('Created {} new gazette items'.format(created))
+
+
+if __name__ == "__main__":
+    opts = docopt(__doc__, version=iepy.__version__)
+    fname = opts["<filename>"]
+    add_gazettes_from_csv(fname)

+ 59 - 0
examples/product/bin/iepy_rules_runner.py

@@ -0,0 +1,59 @@
+"""
+Run IEPY rule-based extractor
+
+Usage:
+    iepy_rules_runner.py
+    iepy_rules_runner.py -h | --help | --version
+
+Picks from rules.py the relation to work with, and the rules definitions and
+proceeds with the extraction.
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+import sys
+import logging
+
+from django.core.exceptions import ObjectDoesNotExist
+
+import iepy
+iepy.setup(__file__)
+
+from iepy.extraction.rules import load_rules
+from iepy.extraction.rules_core import RuleBasedCore
+from iepy.data import models, output
+from iepy.data.db import CandidateEvidenceManager
+
+
def run_from_command_line():
    """Run the rule-based extraction pipeline for the relation declared as
    RELATION in the instance's rules module, dumping predictions at the end."""
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    try:
        relation_name = iepy.instance.rules.RELATION
    except AttributeError:
        logging.error("RELATION not defined in rules file")
        sys.exit(1)

    try:
        relation = models.Relation.objects.get(name=relation_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(relation_name))
        sys.exit(1)

    rules = load_rules()
    evidences = CandidateEvidenceManager.candidates_for_relation(relation)

    # Build, warm up and run the rule-based extractor, then dump its output.
    extractor = RuleBasedCore(relation, rules)
    extractor.start()
    extractor.process()
    output.dump_output_loop(extractor.predict(evidences))
+
+
if __name__ == '__main__':
    # Script entry point.
    run_from_command_line()

+ 184 - 0
examples/product/bin/iepy_runner.py

@@ -0,0 +1,184 @@
+"""
+Run IEPY active-learning extractor
+
+Usage:
+    iepy_runner.py [options] <relation_name> <output>
+    iepy_runner.py [options] --db-store <relation_name>
+    iepy_runner.py -h | --help | --version
+
+Options:
+  --store-extractor=<extractor_output>     Stores the trained classifier
+  --trained-extractor=<extractor_path>     Load an already trained extractor
+  --db-store                               Stores the predictions on the database
+  --no-questions                           Won't generate questions to answer. Will predict
+                                           as is. Should be used with --trained-extractor
+  --tune-for=<tune-for>                    Predictions tuning. Options are high-prec
+                                           or high-recall [default: high-prec]
+  --extractor-config=<config.json>         Sets the extractor config
+  --version                                Version number
+  -h --help                                Show this screen
+"""
+
+import os
+import json
+import logging
+from docopt import docopt
+from sys import exit
+
+import iepy
+INSTANCE_PATH = iepy.setup(__file__)
+
+from iepy.extraction.active_learning_core import ActiveLearningCore, HIPREC, HIREC
+from iepy.data.db import CandidateEvidenceManager
+from iepy.data.models import Relation
+from iepy.extraction.terminal import TerminalAdministration
+from iepy.data import output
+
+
def print_all_relations():
    """Print every Relation stored in the database, one per line."""
    print("All available relations:")
    for rel in Relation.objects.all():
        line = "  {}".format(rel)
        print(line)
+
+
def load_labeled_evidences(relation, evidences):
    """Return labels for *evidences*, resolving conflicts newest-wins."""
    manager = CandidateEvidenceManager
    return manager.labels_for(
        relation, evidences, manager.conflict_resolution_newest_wins
    )
+
+
def _get_tuning_mode(opts):
    """Map the --tune-for CLI option to its tuning constant, or exit(1)."""
    modes = {'high-prec': HIPREC, 'high-recall': HIREC}
    choice = opts['--tune-for']
    if choice not in modes:
        print('Invalid tuning mode')
        print(__doc__)
        exit(1)
    return modes[choice]
+
+
def _get_relation(opts):
    """Fetch the Relation named on the CLI; on failure list all and exit."""
    name = opts['<relation_name>']
    try:
        return Relation.objects.get(name=name)
    except Relation.DoesNotExist:
        print("Relation {!r} non existent".format(name))
        print_all_relations()
        exit(1)
+
+
def _load_extractor(opts, relation, labeled_evidences):
    """Load a previously trained extractor from disk and sanity-check it.

    Exits with status 1 if the file is not a valid extractor, or if it
    was trained for a different relation than the requested one.
    """
    # NOTE(review): callers are expected to have checked --trained-extractor
    # is set; opts.get() would hand None to load() otherwise.
    extractor_path = opts.get('--trained-extractor')
    try:
        iextractor = ActiveLearningCore.load(extractor_path,
                                             labeled_evidences=labeled_evidences)
    except ValueError:
        print("Error: unable to load extractor, invalid file")
        exit(1)

    # Refuse an extractor trained for some other relation.
    if iextractor.relation != relation:
        print('The loaded extractor is not for the requested relation'
              ' but for relation {} instead'.format(iextractor.relation))
        exit(1)
    print('Extractor successfully loaded')
    return iextractor
+
+
def _construct_extractor(opts, relation, labeled_evidences, tuning_mode):
    """Build a fresh ActiveLearningCore from the JSON extractor config.

    The config path comes from --extractor-config, falling back to
    extractor_config.json in the IEPY instance directory.  Exits with
    status 1 when the file is missing or cannot be parsed as JSON.
    """
    config_filepath = opts.get("--extractor-config")
    if not config_filepath:
        config_filepath = os.path.join(INSTANCE_PATH, "extractor_config.json")

    if not os.path.exists(config_filepath):
        print("Error: extractor config does not exists, please create the "
              "file extractor_config.json or use the --extractor-config")
        exit(1)

    with open(config_filepath) as filehandler:
        try:
            extractor_config = json.load(filehandler)
        except Exception as error:
            print("Error: unable to load extractor config: {}".format(error))
            exit(1)

    iextractor = ActiveLearningCore(
        relation, labeled_evidences, extractor_config, tradeoff=tuning_mode
    )
    return iextractor
+
+
def run_from_command_line():
    """Drive the active-learning extraction run from CLI options.

    Builds or loads an extractor, optionally runs the interactive
    labeling loop, then predicts over every candidate evidence and
    stores the result to the database and/or a CSV file as requested.
    """
    opts = docopt(__doc__, version=iepy.__version__)

    logging.basicConfig(level=logging.INFO, format='%(message)s')
    logging.getLogger("featureforge").setLevel(logging.WARN)

    tuning_mode = _get_tuning_mode(opts)
    relation = _get_relation(opts)

    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    labeled_evidences = load_labeled_evidences(relation, candidates)

    if opts.get('--trained-extractor'):
        iextractor = _load_extractor(opts, relation, labeled_evidences)
        was_ever_trained = True
        # A pre-trained extractor implies no interactive questions.
        opts["--no-questions"] = True
    else:
        iextractor = _construct_extractor(opts, relation, labeled_evidences, tuning_mode)
        iextractor.start()
        was_ever_trained = False

    if not opts.get("--no-questions", False):
        questions_loop(iextractor, relation, was_ever_trained)

    # Candidates generator was consumed when generating labeled_evidences, so we'll
    # define it fresh again
    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
    # Predict and store output
    predictions = iextractor.predict(candidates)  # asking predictions for EVERYTHING
    if not predictions:
        print("Nothing was predicted")
        exit(1)

    if opts.get("--db-store"):
        output.dump_predictions_to_database(relation, predictions)

    output_file = opts.get("<output>")
    if output_file:
        output.dump_runner_output_to_csv(predictions, output_file)

    classifier_output = opts.get("--store-extractor")
    if classifier_output:
        iextractor.save(classifier_output)
+
+
def questions_loop(iextractor, relation, was_ever_trained):
    """Interactive loop: show questions, ingest human labels, retrain.

    Repeats until the extractor has no more questions or the user picks
    the STOP option.  NOTE(review): *was_ever_trained* is only rebound
    locally; the caller never observes the update.
    """
    STOP = u'STOP'
    term = TerminalAdministration(
        relation,
        extra_options=[(STOP, u'Stop execution')]
    )
    while iextractor.questions:
        questions = list(iextractor.questions)  # copying the list
        term.update_candidate_evidences_to_label(questions)
        result = term()
        i = 0
        # Feed every freshly-labeled answer back into the extractor core.
        for c, label_value in load_labeled_evidences(relation, questions).items():
            if label_value is not None:
                iextractor.add_answer(c, label_value)
                i += 1
        print ('Added %s new human labels to the extractor core' % i)
        iextractor.process()
        was_ever_trained = True
        # STOP is honored only after the answers above have been processed.
        if result == STOP:
            break

    if not was_ever_trained:
        # It's needed to run some process before asking for predictions
        iextractor.process()
+
+
# NOTE(review): the u'' prefix is a Python-2 leftover; harmless on Python 3.
if __name__ == u'__main__':
    run_from_command_line()

+ 12 - 0
examples/product/bin/manage.py

@@ -0,0 +1,12 @@
#!/usr/bin/env python
"""Django management entry point for this IEPY instance."""

import sys

from django.core.management import execute_from_command_line

import iepy
# Configure Django settings for this instance before dispatching commands.
iepy.setup(__file__)


if __name__ == "__main__":
    execute_from_command_line(sys.argv)

+ 96 - 0
examples/product/bin/preprocess.py

@@ -0,0 +1,96 @@
+"""
+Corpus preprocessing script
+
+Usage:
+    preprocess.py [options]
+    preprocess.py --split-in=<num-splits> --run-part=<num-part>
+    preprocess.py --increment-ner
+    preprocess.py -h | --help | --version
+
+Options:
+  -h --help                      Show this screen
+  --multiple-cores=<num-cores>   Number of cores (use all to use every processor)
+  --increment-ner                Re run NER and Gazetter for every document. If a document lacked any of the previous steps, will be preprocessed entirely.
+  --version                      Version number
+"""
+import logging
+
+from docopt import docopt
+
+import os
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+import iepy
+import multiprocessing
+iepy.setup(__file__)
+from iepy.data.db import DocumentManager
+from iepy.selfpreprocess.self_preprocess import SelfPreprocesser
+from iepy.selfpreprocess.pipeline import PreProcessPipeline, PreProcessSteps
+# from iepy.preprocess.stanford_preprocess import StanfordPreprocess
+# from iepy.preprocess.pipeline import PreProcessPipeline, PreProcessSteps
+# from iepy.preprocess.segmenter import SyntacticSegmenterRunner
+
+
+
+
class ParallelDocManager(DocumentManager):
    """DocumentManager that can partition its queryset among N workers."""

    def mines_of(self, qset, number_of_processors, my_id):
        # Keep only documents whose id falls in this worker's residue
        # class.  '%%%%' survives the Python %-formatting below as '%%',
        # which Django's .extra()/cursor layer then emits as a literal
        # '%', i.e. final SQL "id % K = N" -- TODO(review): confirm
        # against the driver's paramstyle escaping.
        K = number_of_processors
        N = my_id
        clause = 'id %%%% %s = %s' % (K, N)
        return qset.extra(where=[clause])
+
def start_preprocess(docs, increment_ner):
    """Run the preprocessing pipeline over *docs*.

    Only the SelfPreprocesser step is active; the syntactic segmenter
    step is kept disabled in the step list for reference.
    """
    pipeline = PreProcessPipeline([
        SelfPreprocesser(increment_ner),
        # SyntacticSegmenterRunner(increment=True)
    ], docs)
    pipeline.process_everything()
+
# Entry point: preprocess all documents lacking the brat step, either on
# one core, fanned out over several cores, or as one part of an external
# split (--split-in / --run-part).
if __name__ == '__main__':
    logger = logging.getLogger(u'preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    opts = docopt(__doc__, version=iepy.__version__)
    increment_ner = opts['--increment-ner']

    dm = ParallelDocManager()
    all_docs = dm.get_documents_lacking_preprocess(
        [PreProcessSteps.brat])

    multiple_cores = opts.get('--multiple-cores')
    split_in = opts.get("--split-in")
    run_part = opts.get("--run-part")

    if multiple_cores:
        if multiple_cores == "all":
            multiple_cores = multiprocessing.cpu_count()
        try:
            multiple_cores = int(multiple_cores)
        except ValueError:
            logger.error("Invalid number of cores")
            exit(1)

        # One worker process per core; each handles its own residue class
        # of document ids.  NOTE(review): children are started but never
        # joined, so the parent returns without waiting for them.
        for i in range(multiple_cores):
            process = multiprocessing.Process(
                target=start_preprocess, args=(dm.mines_of(all_docs, multiple_cores, i), increment_ner)
            )
            process.start()
    elif split_in:
        try:
            split_in = int(split_in)
            run_part = int(run_part) - 1  # CLI parts are 1-based
        except ValueError:
            logger.error("Invalid split")
            exit(1)

        # BUGFIX: valid 0-based parts are [0, split_in).  The previous
        # check (run_part > split_in) accepted run_part == split_in,
        # which silently selected an empty document set.
        if run_part < 0 or run_part >= split_in:
            logger.error("Parts must be between 1 and {}".format(split_in))
            exit(1)

        docs = dm.mines_of(all_docs, split_in, run_part)
        start_preprocess(docs, increment_ner)
    else:
        start_preprocess(all_docs, increment_ner)

+ 149 - 0
examples/product/bin/rules_verifier.py

@@ -0,0 +1,149 @@
+"""
+IEPY rules verifier
+
+
+Usage:
+    rules_verifier.py <relation> [options]
+
+Options:
+  --shuffle             Chooses the sample randomly and not the first ones
+  --create-evidences    Creates evidences that are missing [default: false]
+  -r --rule=<rule>      Tests only this rule
+  -l --limit=<limit>    Limits the amount of evidences uses
+  -h --help             Show this screen
+"""
+
+import sys
+import logging
+from docopt import docopt
+
+import refo
+from django.core.exceptions import ObjectDoesNotExist
+from colorama import init as colorama_init
+
+import iepy
+iepy.setup(__file__)
+
+from iepy.data import models
+from iepy.data.models import EvidenceCandidate
+from iepy.data.db import CandidateEvidenceManager
+from iepy.extraction.terminal import TerminalEvidenceFormatter
+from iepy.extraction.rules import (
+    load_rules, compile_rule, generate_tokens_to_match
+)
+from iepy.metrics import result_dict_from_predictions
+
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
def run_from_command_line():
    """Verify the instance's rules against labeled candidate evidence.

    Parses CLI options, loads the requested relation, compiles the rules
    (optionally a single one via --rule), runs them over (a sample of)
    the candidate evidences, and prints matches plus metrics.
    """
    opts = docopt(__doc__, version=iepy.__version__)
    relation_name = opts.get("<relation>")
    limit = opts.get("--limit")
    rule_name = opts.get("--rule")
    shuffle = opts.get("--shuffle")
    create_evidences = opts.get("--create-evidences")

    # -1 means "no limit" downstream.
    if limit is None:
        limit = -1

    try:
        limit = int(limit)
    except ValueError:
        logging.error("Invalid limit value, it must be a number")
        sys.exit(1)

    try:
        relation = models.Relation.objects.get(name=relation_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(relation_name))
        sys.exit(1)

    # Load rules and pre-compile each into (name, regex, expected answer).
    rules = get_rules(rule_name)
    rule_regexes = [
        (rule.__name__, compile_rule(rule, relation), rule.answer) for rule in rules
    ]

    # Load evidences; force creation when the database holds none yet.
    if EvidenceCandidate.objects.all().count() == 0:
        create_evidences = True
    evidences = CandidateEvidenceManager.candidates_for_relation(
        relation, create_evidences, seg_limit=limit, shuffle_segs=shuffle
    )
    conflict_solver = CandidateEvidenceManager.conflict_resolution_newest_wins
    answers = CandidateEvidenceManager.labels_for(
        relation, evidences, conflict_solver
    )
    run_tests(rule_regexes, evidences, answers)
+
+
def run_tests(rule_regexes, evidences, answers):
    """Run every compiled rule over the evidences and report results.

    For each rule, prints the evidences it matches; for evidences that
    carry a human label, accumulates (prediction, label) pairs across
    all rules and finally prints aggregate metrics.
    """
    predictions = []
    real_labels = []
    evidences_with_labels = []

    colorama_init()
    formatter = TerminalEvidenceFormatter()

    for name, regex, answer in rule_regexes:
        title = "Matches for rule '{}' (value: {})".format(name, answer)
        print("\n{}\n{}".format(title, "-" * len(title)))

        anything_matched = False
        for evidence in evidences:
            tokens_to_match = generate_tokens_to_match(evidence)
            match = refo.match(regex, tokens_to_match)

            if match:
                anything_matched = True
                print("  * {}".format(formatter.colored_text(evidence)))

            # Only labeled evidences contribute to the metrics.  Note that
            # each labeled evidence is counted once per rule.
            if evidence in answers and answers[evidence] is not None:
                evidences_with_labels.append(evidence)
                real_labels.append(answers[evidence])

                # A match predicts the rule's answer; no match predicts False.
                if match:
                    predictions.append(answer)
                else:
                    predictions.append(False)

        if not anything_matched:
            print("  nothing matched")

        print()

    if real_labels:
        results = result_dict_from_predictions(
            evidences_with_labels, real_labels, predictions
        )
        results.pop("end_time")
        keys = [
            "true_positives", "true_negatives",
            "false_positives", "false_negatives",
            "precision", "recall",
            "accuracy", "f1",
        ]

        title = "Metrics"
        print("{}\n{}".format(title, "-" * len(title)))
        for key in keys:
            print("{:>15}: {:.2f}".format(key, results[key]))
+
+
def get_rules(rule_name):
    """Load the instance's rules, optionally filtered to *rule_name*.

    Exits with status 1 when a rule with that name does not exist.
    """
    # Load rules
    rules = load_rules()

    if rule_name:
        rules = [x for x in rules if x.__name__ == rule_name]
        if not rules:
            logging.error("rule '{}' does not exists".format(rule_name))
            sys.exit(1)

    return rules
+
+
# Script entry point.
if __name__ == "__main__":
    run_from_command_line()

+ 241 - 0
examples/product/bin/settlement.py

@@ -0,0 +1,241 @@
+
+
+
+
+from django.db.models import Q
+import datetime,time
+import iepy
+iepy.setup(__file__)
+from iepy.data.db import DocumentManager
+from iepy.data.models import IEDocument,LabeledIEDocument,IEDocumentMetadata,LabeledIEDocumentMetadata,Payroll
+from brat.models import BratAnnotation,LabeledBratAnnotation
+from django.db import transaction
+import pandas as pd
+from django.contrib.auth.models import User
+
def object_to_dict(obj, class_model):
    """Return {field_name: field_value} for every local field of *obj*.

    :param obj: model instance to read values from
    :param class_model: Django model class describing the fields
    :return: dict mapping field names to the instance's values
    """
    meta = class_model._meta.concrete_model._meta
    return {
        field.name: field.value_from_object(obj)
        for field in meta.local_fields
    }
+
+
class Settlement():

    '''
    Settlement helper used by the annotation-project operator: computes
    per-user payrolls from annotation counts, exports Excel reports, and
    holds some (mostly disabled) data-migration utilities.
    '''

    def makePayroll(self,_user,time_begin,time_end):
        '''
        :param _user: username
        :param time_begin: period start date (yyyy-mm-dd)
        :param time_end: period end date (yyyy-mm-dd, inclusive)
        :return: creates or updates the user's Payroll row for the period
        '''
        # SECURITY(review): every statement below interpolates user/date
        # values straight into SQL via %-formatting; switch to
        # cursor.execute(sql, params) placeholders to avoid injection.
        from django.db import connection
        with transaction.atomic():
            cursor = connection.cursor()
            # Documents this user edited inside the period.
            sql = " select count(1) from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s'"%(_user,time_end,time_begin)
            cursor.execute(sql)
            doc_count = cursor.fetchall()[0][0]
            # Entity annotations ('T%') within the newest 1200 documents.
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"T%")
            cursor.execute(sql)
            t_count = cursor.fetchall()[0][0]
            # Relation annotations ('R%') within the newest 1200 documents.
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"R%")
            cursor.execute(sql)
            r_count = cursor.fetchall()[0][0]
            # Totals over the whole period (no 1200-document cap).
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s') and value like '%s' "%(_user,time_end,time_begin,"T%")
            cursor.execute(sql)
            all_t_count = cursor.fetchall()[0][0]
            sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>='%s') and value like '%s' "%(_user,time_end,time_begin,"R%")
            cursor.execute(sql)
            all_r_count = cursor.fetchall()[0][0]
            # Piece rates: 0.03/0.05 per entity/relation inside the newest
            # 1200 documents, 0.04/0.06 for the remainder.
            wage = round(0.03*t_count+0.05*r_count+(all_t_count-t_count)*0.04+(all_r_count-r_count)*0.06,2)
            print(doc_count,t_count,r_count,wage)
            # Upsert the Payroll row keyed on (user, begin_time, end_time).
            payrolls = Payroll.objects.filter(Q(user=_user)& Q(begin_time=time_begin) & Q(end_time=time_end))
            if len(payrolls)==0:
                _payroll = Payroll.objects.create(**{"user":_user,"doc_count":doc_count,"begin_time":time_begin,"end_time":time_end,"t_count":all_t_count,"r_count":all_r_count,"wage":wage})
                _payroll.save()
            else:
                _payroll = payrolls[0]
                _payroll.doc_count = doc_count
                _payroll.t_count = all_t_count
                _payroll.r_count = all_r_count
                _payroll.wage = wage
                _payroll.save()

    def exportPayroll(self,begin_time,end_time):
        '''
        :param begin_time: export period start (None matches any start)
        :param end_time: export period end
        :return: writes an .xls report of the matching Payroll rows
        '''
        list_user = []
        list_doc_count = []
        list_t_count = []
        list_r_count = []
        list_wage = []
        list_yield = []
        list_account = []
        list_begin = []
        list_end = []
        if begin_time is not None:
            payrolls = Payroll.objects.filter(Q(begin_time=begin_time) & Q(end_time=end_time))
        else:
            payrolls = Payroll.objects.filter(Q(end_time=end_time))
        for _payroll in payrolls:
            list_user.append(_payroll.user)
            list_doc_count.append(_payroll.doc_count)
            list_t_count.append(_payroll.t_count)
            list_r_count.append(_payroll.r_count)
            list_wage.append(_payroll.wage)
            list_yield.append(_payroll._yield)
            list_account.append(_payroll.account)
            list_begin.append(_payroll.begin_time)
            list_end.append(_payroll.end_time)
        # Column names are intentionally Chinese: they become the Excel headers.
        df = pd.DataFrame({"用户":list_user,"开始时间":list_begin,"结束时间":list_end,"文章数":list_doc_count,"要素数":list_t_count,"关系数":list_r_count,"总价":list_wage,"合格率":list_yield,"结算价":list_account})
        df.to_excel("%s-%s要素标注统计.xls"%(begin_time,end_time),columns=["用户","开始时间","结束时间","文章数","要素数","关系数","总价","合格率","结算价"])

    def getAllUser(self):
        # Return the usernames of all staff users (the annotators to settle).
        from django.db import connection
        with transaction.atomic():
            list_user = []
            cursor = connection.cursor()
            sql = "select username from auth_user where is_staff='t'"
            cursor.execute(sql)
            for row in cursor.fetchall():
                list_user.append(row[0])
            return list_user


    def makeMigrate(self,_user,time_begin,time_end):
        '''
        :param _user: username
        :param time_begin: period start
        :param time_end: period end
        :return: migrate the user's labeled data in the period into the
                 "labeled" tables (currently disabled; old body kept below
                 for reference)
        '''
        pass
        # from django.db import connection
        # with transaction.atomic():
        #     cursor = connection.cursor()
        #     sql = " select human_identifier,offsets_to_text,sentences from corpus_iedocument where edituser is null"
        #     cursor.execute(sql)
        #     cursor1 = connection.cursor()
        #     _index = 0
        #     rows = True
        #     while(rows):
        #         rows=cursor.fetchmany(1000)
        #         for row in rows:
        #             _index += 1
        #             print(_index)
        #             human_identifier,offsets_to_text,sentences = row
        #             if sentences!="[]":
        #                 _off = offsets_to_text.split(", ")[-1][:-1]
        #                 _sen = sentences.split(", ")[-1][:-1]
        #                 print(_off,_sen)
        #                 if int(_off)!=int(_sen):
        #                     offsets_to_text = offsets_to_text[:-1]+", "+str(int(_sen))+"]"
        #                     print(offsets_to_text)
        #                     cursor1.execute("update corpus_iedocument set offsets_to_text='%s' where human_identifier='%s'"%(offsets_to_text,human_identifier))



            # ieDocuments = IEDocument.objects.filter(Q(edituser=_user) & Q(edittime__range=(time_begin,time_end)))
            # for obj in ieDocuments:
            #     _dict = object_to_dict(obj,IEDocument)
            #     _dict_meta = object_to_dict(obj.metadata,IEDocumentMetadata)
            #     labeledMeta = LabeledIEDocumentMetadata.objects.create(**_dict_meta)
            #     labeledMeta.save()
            #     _dict["metadata"] = labeledMeta
            #     tmp = LabeledIEDocument.objects.create(**_dict)
            #     tmp.save()
            #
            #     bratAnnotations = BratAnnotation.objects.filter(Q(document_id=obj.human_identifier))
            #     for ann in bratAnnotations:
            #         _dict_ann = object_to_dict(ann,BratAnnotation)
            #         labeledAnn = LabeledBratAnnotation.objects.create(**_dict_ann)
            #         labeledAnn.save()


    def getPercentOfPass(self,_user,time_begin,time_end):
        '''
        :param _user: username
        :param time_begin: period start
        :param time_end: period end
        :return: the user's annotation pass rate over the period.
                 NOTE(review): not implemented -- the docstring is the
                 entire body, so this always returns None.
        '''

    def makePayrolls(self,time_begin,time_end):
        '''
        :param time_begin: period start
        :param time_end: period end
        :return: builds every staff user's payroll, then exports the report
        '''
        for _user in self.getAllUser():
            self.makePayroll(_user,time_begin,time_end)
        self.exportPayroll(time_begin,time_end)

    def createUser_batch(self,batch_size=90):
        '''
        :param batch_size: number of users to create
        :return:
        '''
        # Creates users bidi1..bidiN with the password equal to the username.
        list_user = [User.objects.create_user(username="bidi%d"%(i+1),password="bidi%d"%(i+1)) for i in range(batch_size)]

    def exportLabels(self):
        # Export, per hard-coded user group, which documents each user
        # labeled and when -- one Excel file per group.
        groups = [[1,7],[8,14],[15,22],[23,29],[30,36],[37,43],[44,50],[51,56],[57,62],[63,71]]
        from django.db import connection
        cursor = connection.cursor()
        for _i in range(len(groups)):
            _begin,_end = groups[_i]
            list_username = []
            list_user = []
            list_label = []
            list_time = []
            for _j in range(_begin,_end+1):
                username = "bidi%d"%_j
                list_username.append("'%s'"%username)
            sql = " select edituser,human_identifier,to_char(edittime,'yyyy-mm-dd') from corpus_iedocument where edituser in(%s) order by edittime asc"%(",".join(list_username))
            print(sql)
            cursor.execute(sql)
            rows = cursor.fetchall()
            for row in rows:
                list_user.append(row[0])
                list_label.append(row[1])
                list_time.append(row[2])
            df = pd.DataFrame({"时间":list_time,"用户":list_user,"文章编号":list_label})
            df.to_excel("分组_%d.xls"%(_i+1),columns=["时间","用户","文章编号"])

    def filter(self):
        '''
        Flag auction notices: set jump_signal=1 on every document whose
        text contains "拍卖" (auction).
        :return:
        '''
        import re
        ieDocuments = IEDocument.objects.all()
        for obj in ieDocuments:
            if re.search("拍卖",obj.text) is not None:
                obj.jump_signal = 1
                obj.save()
                print(obj.human_identifier)
+
+
+
# Manual driver: uncomment the operation to run.
if __name__=="__main__":
    settle = Settlement()
    # settle.makeMigrate("test","2020-08-01","2020-08-31")
    # settle.makePayroll("test17","2020-08-01","2020-10-31")
    # settle.makePayrolls("2020-08-01","2020-08-31")
    settle.exportPayroll(begin_time=None,end_time='2020-10-31')
    # settle.createUser_batch(batch_size=102)
    # settle.exportLabels()
    # settle.filter()
+ 20 - 0
examples/product/extractor_config.json

@@ -0,0 +1,20 @@
+{
+    "sparse_features": [
+        "bag_of_words",
+        "bag_of_pos",
+        "bag_of_words_in_between",
+        "bag_of_pos_in_between"
+    ],
+    "dense_features": [
+        "entity_order",
+        "entity_distance",
+        "other_entities_in_between",
+        "verbs_count_in_between",
+        "verbs_count",
+        "total_number_of_entities",
+        "symbols_in_between",
+        "number_of_tokens"
+    ],
+    "classifier_args": {},
+    "classifier": "svc"
+}

+ 6 - 0
examples/product/format.py

@@ -0,0 +1,6 @@
"""Ad-hoc helper: print an epoch four days in the past and format sample
epoch timestamps as yyyy-mm-dd dates."""
import time

# Sample epoch timestamps to format.
a = [1462636800, 1606492800]

# Epoch seconds four days before "now".
print(time.time() - 86400 * 4)

for item in a:
    # BUGFIX: previously formatted the hard-coded epoch 1606377029 on
    # every iteration, ignoring the loop variable.
    print(time.strftime('%Y-%m-%d', time.localtime(item)))

Fichier diff supprimé car celui-ci est trop grand
+ 188 - 0
examples/product/product_article.csv


+ 2 - 0
examples/product/rules.py

@@ -0,0 +1,2 @@
+# Write here your rules
+# RELATION = 'your relation here'

+ 182 - 0
examples/product/settings.py

@@ -0,0 +1,182 @@
+"""
+For more information on this file, see
+https://docs.djangoproject.com/en/1.7/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/1.7/ref/settings/
+"""
+
+from iepy.webui.webui.settings import *
+from django.conf import settings
+
IEPY_VERSION = '0.9.6'
IEPY_LANG = 'en'
# SECURITY(review): the secret key and DB credentials below are committed
# in plain text; acceptable for a local example instance only -- rotate
# them before any real deployment.
SECRET_KEY = 'u==!fueit=wxo&j8!5u+sfasp4prjluk@*s=7!-wz_&r@pn))r'
DEBUG = True
TEMPLATE_DEBUG = True

# Database
# https://docs.djangoproject.com/en/1.7/ref/settings/#databases
# DATABASES = {
#     'default': {
#         'ENGINE': 'django.db.backends.sqlite3',
#         'NAME': '/home/python/luojiehua/dl_nlp/iepy-develop/examples/test/test.sqlite',
#     }
# }
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.postgresql_psycopg2',
        'NAME': 'iepy_product',
        'USER': 'postgres',
        'PASSWORD': 'postgres',
        'HOST': '192.168.2.101',
        'PORT': '5432'
    }
}
+
+# For changing tokenization options, read here.
+# http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/process/PTBTokenizer.html
+# You can use as key any of the "known options" listed on that page, and as value,
+# use True or False (python names) for booleans, or strings when option requires a text
+# CORENLP_TKN_OPTS = {
+#     'latexQuotes': False
+# }
+
+#default brat settings
+CONFIG_BRAT = {
+    "acl.conf":"""
+User-agent: *
+Allow: /
+Disallow: /hidden/
+
+User-agent: guest
+Disallow: /confidential/
+""",
+    "annotation.conf":"""
+[spans]
+product
+code
+name
+money
+	money_tendereeMoney
+	money_tendererMoney
+org
+	org_tenderee
+	org_agency
+	org_tenderer
+	org_secondTenderer
+	org_thirdTenderer
+company
+	company_tenderee
+	company_agency
+	company_tenderer
+	company_secondTenderer
+	company_thirdTenderer
+job
+person
+	person_tendereePerson
+	person_agencyPerson
+	person_person
+	person_review
+time
+	time_release
+	time_bidopen
+	time_bidclose
+location
+package
+phone
+moneysource
+bidway
+serviceTime
+[relations]
+Equiv	Arg1:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, <REL-TYPE>:symmetric-transitive
+rel_tendererMoney	Arg1:org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderer|company_secondTenderer|company_thirdTenderer|org|company, Arg2:money_tendererMoney
+rel_tendereeMoney	Arg1:package, Arg2:money_tendereeMoney|money
+rel_person	Arg1:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:person_tendereePerson|person_agencyPerson|person_person
+rel_pack	Arg1:org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:package
+rel_address	Arg1:org|company|org_tenderee|org_agency|org_tenderer|org_secondTenderer|org_thirdTenderer|company_tenderee|company_agency|company_tenderer|company_secondTenderer|company_thirdTenderer, Arg2:location
+rel_phone	Arg1:person_tendereePerson|person_agencyPerson|person_person, Arg2:phone
+rel_pack_code	Arg1:package, Arg2:code
+rel_pack_name	Arg1:package, Arg2:name
+
+
+[events]
+#Protein_binding|GO:0005515	Theme+:Protein
+#Gene_expression|GO:0010467	Theme:Protein
+
+[attributes]
+#att_role	Arg:<ENTITY>, Value:招标人|代理人|中标人|第二候选人|第三候选人|att_noRole
+#att_role	Arg:<ENTITY>, Value:att_tenderee|att_agency|att_tenderer|att_secondTenderer|att_thirdTenderer|att_noRole
+#att_money	Arg:<ENTITY>, Value:att_tendereeMoney|att_tendererMoney|att_nomoney
+#att_person	Arg:<ENTITY>, Value:att_noperson|att_tendereePerson|att_agencyPerson|att_person
+#Negation	Arg:<EVENT>
+#Speculation	Arg:<EVENT>
+""",
+    "visual.conf":"""
+[labels]
+code | 项目编号
+name | 项目名称
+org | 组织
+company | 公司
+job | 职业
+person | 人名
+time | 时间
+location | 地址
+package | 包号
+phone | 电话
+money | 金额
+money_tendereeMoney | 招标金额
+money_tendererMoney | 中投标金额
+
+org_tenderee | 招标人
+org_agency | 代理人
+org_tenderer | 中标人
+org_secondTenderer | 第二候选人
+org_thirdTenderer | 第三候选人
+company_tenderee | 招标人
+company_agency | 代理人
+company_tenderer | 中标人
+company_secondTenderer | 第二候选人
+company_thirdTenderer | 第三候选人
+
+person_tendereePerson | 招标联系人
+person_agencyPerson | 代理联系人
+person_person | 联系人
+
+rel_tendererMoney | 中投标金额
+rel_tendereeMoney | 招标金额
+rel_person | 联系人
+rel_pack | 所属包
+rel_address | 地址
+rel_phone | 联系电话
+rel_pack_code | 包件编号
+rel_pack_name | 包件名称
+
+person_review | 评审专家
+time_release | 发布时间
+time_bidopen | 开标时间
+time_bidclose | 截标时间
+moneysource | 资金来源
+bidway | 招标方式
+serviceTime | 服务期限
+product | 产品
+
+#Protein | Protein | Pro | P
+#Protein_binding | Protein binding | Binding | Bind
+#Gene_expression | Gene expression | Expression | Exp
+#Theme | Theme | Th
+
+[drawing]
+Protein	bgColor:#7fa2ff
+SPAN_DEFAULT	fgColor:black, bgColor:lightgreen, borderColor:black
+ARC_DEFAULT	color:black
+ATTRIBUTE_DEFAULT	glyph:*
+""",
+    "tools.conf":"""
+[search]
+google     <URL>:http://www.google.com/search?q=%s
+""",
+    "kb_shortcuts.conf":"""
+P	Protein
+"""
+}

BIN
examples/product/test.sqlite


+ 1 - 0
examples/test/__init__.py

@@ -0,0 +1 @@
+from . import rules

+ 49 - 0
examples/test/annotation.conf

@@ -0,0 +1,49 @@
+# -*- Mode: Text; tab-width: 8; indent-tabs-mode: nil; coding: utf-8; -*-
+# vim:set ft=conf ts=2 sw=2 sts=2 autoindent:
+
+# Simple text-based definitions of entity, relation and event types
+# and event attributes for the BioNLP Shared Task 2011 EPI task.
+
+
+[entities]
+
+Protein
+	abc
+Entity
+
+
+[relations]
+
+Equiv	Arg1:Protein, Arg2:Protein, <REL-TYPE>:symmetric-transitive
+Equiv	Arg1:Entity, Arg2:Entity, <REL-TYPE>:symmetric-transitive
+
+# (No entity nestings permitted in EPI. Could be defined using special
+# relation type ENTITY-NESTING if necessary.)
+
+
+[events]
+
+Catalysis	Theme:<EVENT>, Cause:Protein
+----------------------------------------
+DNA_methylation|GO:0006306	Theme:Protein, Site?:Entity
+DNA_demethylation|GO:0080111	Theme:Protein, Site?:Entity
+----------------------------------------
+Acetylation|GO:0006473	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Methylation|GO:0006479	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Glycosylation|GO:0006486	Theme:Protein, Site?:Entity, Sidechain?:Entity
+Hydroxylation|GO:0018126	Theme:Protein, Site?:Entity
+Phosphorylation|GO:0006468	Theme:Protein, Site?:Entity
+Ubiquitination|GO:0016567	Theme:Protein, Site?:Entity
+----------------------------------------
+Deacetylation|GO:0006476	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Demethylation|GO:0006482	Theme:Protein, Site?:Entity, Contextgene?:Protein
+Deglycosylation|GO:0006517	Theme:Protein, Site?:Entity, Sidechain?:Entity
+Dehydroxylation|GO:-------	Theme:Protein, Site?:Entity
+Dephosphorylation|GO:0006470	Theme:Protein, Site?:Entity
+Deubiquitination|GO:0016579	Theme:Protein, Site?:Entity
+
+
+[attributes]
+
+Negation	Arg:<EVENT>
+Speculation	Arg:<EVENT>

Fichier diff supprimé car celui-ci est trop grand
+ 7 - 0
examples/test/articles.csv


BIN
examples/test/bin/2020-08-01-2020-08-31要素标注统计.xls


BIN
examples/test/bin/None-2020-09-25要素标注统计.xls


BIN
examples/test/bin/None-2020-10-31要素标注统计.xls


BIN
examples/test/bin/None-2020-11-25要素标注统计.xls


BIN
examples/test/bin/None-2020-12-25要素标注统计.xls


+ 28 - 0
examples/test/bin/csv_to_iepy.py

@@ -0,0 +1,28 @@
+"""
+IEPY database loader from csv file
+
+Usage:
+    csv_to_iepy.py <filename>
+    csv_to_iepy.py -h | --help
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+corpus in two columns: 'freebase_mid' and 'description'.
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+
+import logging
+
+from docopt import docopt
+
+import iepy
+# Must run before importing iepy.utils: wires this instance's settings.
+iepy.setup(__file__)
+from iepy.utils import csv_to_iepy
+
+# NOTE(review): the docstring above is the docopt CLI spec. It lists
+# --version under Options but no Usage pattern accepts it, so passing
+# --version will produce a usage error. Also the 'freebase_mid'/'description'
+# column names look copied from the birthdate example — confirm they match
+# this instance's corpus.
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+    opts = docopt(__doc__, version=iepy.__version__)
+    filepath = opts["<filename>"]
+    # Parses the csv and loads every row as an IEDocument into the database.
+    csv_to_iepy(filepath)

+ 76 - 0
examples/test/bin/gazettes_loader.py

@@ -0,0 +1,76 @@
+"""
+IEPY gazettes loader
+
+Usage:
+    gazettes_loader.py <filename>
+
+
+The <filename> argument can be a .csv file or a .csv.gz file containing the
+gazettes in two columns: 'literal' and 'class'.
+
+
+Options:
+  -h --help             Show this screen
+"""
+
+import sys
+import csv
+import gzip
+import logging
+from operator import itemgetter
+
+from django.db import IntegrityError
+from docopt import docopt
+
+import iepy
+iepy.setup(__file__)
+from iepy.data.models import EntityKind, GazetteItem
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+def add_gazettes_from_csv(filepath):
+    if filepath.endswith(".gz"):
+        fin = gzip.open(filepath, "rt")
+    else:
+        fin = open(filepath, "rt")
+    reader = csv.DictReader(fin)
+
+    expected_fnames = ['literal', 'class']
+    if not set(reader.fieldnames).issuperset(expected_fnames):
+        msg = "Couldn't find the expected field names on the provided csv: {}"
+        sys.exit(msg.format(expected_fnames))
+
+    _create_gazette_entries(
+        itemgetter(*expected_fnames)(line) for line in reader
+    )
+
+
+def _create_gazette_entries(entries_list):
+    kind_cache = {}
+    created = 0
+    for literal, kind_name in entries_list:
+        literal = literal.strip()
+        kind_name = kind_name.strip()
+        kind = kind_cache.get(kind_name)
+        if kind is None:
+            kind, _ = EntityKind.objects.get_or_create(name=kind_name)
+            kind_cache[kind_name] = kind
+        gazette = GazetteItem(text=literal, kind=kind)
+
+        try:
+            gazette.save()
+        except IntegrityError as error:
+            logging.warn(
+                "Gazette '{}' of class '{}' not loaded, literal already existed".format(
+                literal, kind_name))
+            print(error)
+        finally:
+            created += 1
+    print('Created {} new gazette items'.format(created))
+
+
+# NOTE(review): the module docstring's Usage section has no --version
+# pattern, so docopt's version= argument is effectively unused here.
+if __name__ == "__main__":
+    opts = docopt(__doc__, version=iepy.__version__)
+    fname = opts["<filename>"]
+    add_gazettes_from_csv(fname)

+ 59 - 0
examples/test/bin/iepy_rules_runner.py

@@ -0,0 +1,59 @@
+"""
+Run IEPY rule-based extractor
+
+Usage:
+    iepy_rules_runner.py
+    iepy_rules_runner.py -h | --help | --version
+
+Picks from rules.py the relation to work with, and the rules definitions and
+proceeds with the extraction.
+
+Options:
+  -h --help             Show this screen
+  --version             Version number
+"""
+import sys
+import logging
+
+from django.core.exceptions import ObjectDoesNotExist
+
+import iepy
+iepy.setup(__file__)
+
+from iepy.extraction.rules import load_rules
+from iepy.extraction.rules_core import RuleBasedCore
+from iepy.data import models, output
+from iepy.data.db import CandidateEvidenceManager
+
+
+def run_from_command_line():
+    """Run the rule-based extractor for the relation named in rules.RELATION.
+
+    Exits with status 1 when RELATION is missing from the rules module or
+    when no Relation with that name exists in the database.
+    """
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+    try:
+        relation_name = iepy.instance.rules.RELATION
+    except AttributeError:
+        logging.error("RELATION not defined in rules file")
+        sys.exit(1)
+
+    try:
+        relation = models.Relation.objects.get(name=relation_name)
+    except ObjectDoesNotExist:
+        logging.error("Relation {!r} not found".format(relation_name))
+        sys.exit(1)
+
+    # Load rules from the instance's rules module.
+    rules = load_rules()
+
+    # Load candidate evidences for the relation.
+    evidences = CandidateEvidenceManager.candidates_for_relation(relation)
+
+    # Run the pipeline: start, process, then predict over all evidences
+    # and hand the predictions to the interactive output dumper.
+    iextractor = RuleBasedCore(relation, rules)
+    iextractor.start()
+    iextractor.process()
+    predictions = iextractor.predict(evidences)
+    output.dump_output_loop(predictions)
+
+
+if __name__ == u'__main__':
+    run_from_command_line()

+ 184 - 0
examples/test/bin/iepy_runner.py

@@ -0,0 +1,184 @@
+"""
+Run IEPY active-learning extractor
+
+Usage:
+    iepy_runner.py [options] <relation_name> <output>
+    iepy_runner.py [options] --db-store <relation_name>
+    iepy_runner.py -h | --help | --version
+
+Options:
+  --store-extractor=<extractor_output>     Stores the trained classifier
+  --trained-extractor=<extractor_path>     Load an already trained extractor
+  --db-store                               Stores the predictions on the database
+  --no-questions                           Won't generate questions to answer. Will predict
+                                           as is. Should be used with --trained-extractor
+  --tune-for=<tune-for>                    Predictions tuning. Options are high-prec
+                                           or high-recall [default: high-prec]
+  --extractor-config=<config.json>         Sets the extractor config
+  --version                                Version number
+  -h --help                                Show this screen
+"""
+
+import os
+import json
+import logging
+from docopt import docopt
+from sys import exit
+
+import iepy
+INSTANCE_PATH = iepy.setup(__file__)
+
+from iepy.extraction.active_learning_core import ActiveLearningCore, HIPREC, HIREC
+from iepy.data.db import CandidateEvidenceManager
+from iepy.data.models import Relation
+from iepy.extraction.terminal import TerminalAdministration
+from iepy.data import output
+
+
+def print_all_relations():
+    """Print every Relation in the database (used from error paths)."""
+    print("All available relations:")
+    for relation in Relation.objects.all():
+        print("  {}".format(relation))
+
+
+def load_labeled_evidences(relation, evidences):
+    """Return labels for *evidences*; on conflicting labels the newest wins."""
+    CEM = CandidateEvidenceManager  # shortcut
+    return CEM.labels_for(relation, evidences, CEM.conflict_resolution_newest_wins)
+
+
+def _get_tuning_mode(opts):
+    """Map the --tune-for option to HIPREC/HIREC; exit(1) on any other value."""
+    if opts['--tune-for'] == 'high-prec':
+        tuning_mode = HIPREC
+    elif opts['--tune-for'] == 'high-recall':
+        tuning_mode = HIREC
+    else:
+        print ('Invalid tuning mode')
+        print (__doc__)
+        exit(1)
+    return tuning_mode
+
+
+def _get_relation(opts):
+    """Fetch the Relation named by <relation_name>; list all and exit(1) if absent."""
+    relation_name = opts['<relation_name>']
+    try:
+        relation = Relation.objects.get(name=relation_name)
+    except Relation.DoesNotExist:
+        print("Relation {!r} non existent".format(relation_name))
+        print_all_relations()
+        exit(1)
+    return relation
+
+
+def _load_extractor(opts, relation, labeled_evidences):
+    """Load a previously-trained extractor from --trained-extractor.
+
+    Exits with status 1 when the file is not a valid extractor dump or when
+    the dump was trained for a different relation than the requested one.
+    """
+    extractor_path = opts.get('--trained-extractor')
+    try:
+        iextractor = ActiveLearningCore.load(extractor_path,
+                                             labeled_evidences=labeled_evidences)
+    except ValueError:
+        print("Error: unable to load extractor, invalid file")
+        exit(1)
+
+    if iextractor.relation != relation:
+        print('The loaded extractor is not for the requested relation'
+              ' but for relation {} instead'.format(iextractor.relation))
+        exit(1)
+    print('Extractor successfully loaded')
+    return iextractor
+
+
+def _construct_extractor(opts, relation, labeled_evidences, tuning_mode):
+    """Build a fresh ActiveLearningCore from a JSON extractor config.
+
+    The config comes from --extractor-config or, when omitted, from
+    extractor_config.json in the instance directory. Exits with status 1
+    when the file is missing or is not valid JSON.
+    """
+    config_filepath = opts.get("--extractor-config")
+    if not config_filepath:
+        config_filepath = os.path.join(INSTANCE_PATH, "extractor_config.json")
+
+    if not os.path.exists(config_filepath):
+        print("Error: extractor config does not exists, please create the "
+              "file extractor_config.json or use the --extractor-config")
+        exit(1)
+
+    with open(config_filepath) as filehandler:
+        try:
+            extractor_config = json.load(filehandler)
+        except Exception as error:
+            print("Error: unable to load extractor config: {}".format(error))
+            exit(1)
+
+    iextractor = ActiveLearningCore(
+        relation, labeled_evidences, extractor_config, tradeoff=tuning_mode
+    )
+    return iextractor
+
+
+def run_from_command_line():
+    """Entry point: train/load an extractor, optionally ask questions, predict.
+
+    Flow: parse options, resolve the relation, build labeled evidences,
+    then either load a trained extractor (which implies --no-questions) or
+    construct a new one; run the interactive labeling loop unless disabled,
+    predict over every candidate, and store results (db/csv/extractor dump)
+    according to the options. Exits with status 1 when nothing is predicted.
+    """
+    opts = docopt(__doc__, version=iepy.__version__)
+
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+    logging.getLogger("featureforge").setLevel(logging.WARN)
+
+    tuning_mode = _get_tuning_mode(opts)
+    relation = _get_relation(opts)
+
+    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
+    labeled_evidences = load_labeled_evidences(relation, candidates)
+
+    if opts.get('--trained-extractor'):
+        iextractor = _load_extractor(opts, relation, labeled_evidences)
+        was_ever_trained = True
+        # A pre-trained extractor skips the interactive question loop.
+        opts["--no-questions"] = True
+    else:
+        iextractor = _construct_extractor(opts, relation, labeled_evidences, tuning_mode)
+        iextractor.start()
+        was_ever_trained = False
+
+    if not opts.get("--no-questions", False):
+        questions_loop(iextractor, relation, was_ever_trained)
+
+    # Candidates generator was consumed when generating labeled_evidences, so we'll
+    # define it fresh again
+    candidates = CandidateEvidenceManager.candidates_for_relation(relation)
+    # Predict and store output
+    predictions = iextractor.predict(candidates)  # asking predictions for EVERYTHING
+    if not predictions:
+        print("Nothing was predicted")
+        exit(1)
+
+    if opts.get("--db-store"):
+        output.dump_predictions_to_database(relation, predictions)
+
+    output_file = opts.get("<output>")
+    if output_file:
+        output.dump_runner_output_to_csv(predictions, output_file)
+
+    classifier_output = opts.get("--store-extractor")
+    if classifier_output:
+        iextractor.save(classifier_output)
+
+
+def questions_loop(iextractor, relation, was_ever_trained):
+    """Interactive loop: present questions, feed answers back, re-process.
+
+    Repeats while the extractor still has questions, stopping early when the
+    user picks the STOP option. NOTE(review): the reassignment of
+    was_ever_trained inside the loop only affects the local name — callers
+    never see the update; here it only guards the final process() call.
+    """
+    STOP = u'STOP'
+    term = TerminalAdministration(
+        relation,
+        extra_options=[(STOP, u'Stop execution')]
+    )
+    while iextractor.questions:
+        questions = list(iextractor.questions)  # copying the list
+        term.update_candidate_evidences_to_label(questions)
+        result = term()
+        i = 0
+        # Feed every freshly-labeled answer back into the extractor.
+        for c, label_value in load_labeled_evidences(relation, questions).items():
+            if label_value is not None:
+                iextractor.add_answer(c, label_value)
+                i += 1
+        print ('Added %s new human labels to the extractor core' % i)
+        iextractor.process()
+        was_ever_trained = True
+        if result == STOP:
+            break
+
+    if not was_ever_trained:
+        # It's needed to run some process before asking for predictions
+        iextractor.process()
+
+
+if __name__ == u'__main__':
+    run_from_command_line()

+ 12 - 0
examples/test/bin/manage.py

@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+# Django manage.py entry point for this IEPY instance; iepy.setup() wires
+# the instance settings before any management command runs.
+
+import sys
+
+from django.core.management import execute_from_command_line
+
+import iepy
+iepy.setup(__file__)
+
+
+if __name__ == "__main__":
+    execute_from_command_line(sys.argv)

+ 96 - 0
examples/test/bin/preprocess.py

@@ -0,0 +1,96 @@
+"""
+Corpus preprocessing script
+
+Usage:
+    preprocess.py [options]
+    preprocess.py --split-in=<num-splits> --run-part=<num-part>
+    preprocess.py --increment-ner
+    preprocess.py -h | --help | --version
+
+Options:
+  -h --help                      Show this screen
+  --multiple-cores=<num-cores>   Number of cores (use all to use every processor)
+  --increment-ner                Re run NER and Gazetter for every document. If a document lacked any of the previous steps, will be preprocessed entirely.
+  --version                      Version number
+"""
+import logging
+
+from docopt import docopt
+
+import os
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+import iepy
+import multiprocessing
+iepy.setup(__file__)
+from iepy.data.db import DocumentManager
+from iepy.selfpreprocess.self_preprocess import SelfPreprocesser
+from iepy.selfpreprocess.pipeline import PreProcessPipeline, PreProcessSteps
+# from iepy.preprocess.stanford_preprocess import StanfordPreprocess
+# from iepy.preprocess.pipeline import PreProcessPipeline, PreProcessSteps
+# from iepy.preprocess.segmenter import SyntacticSegmenterRunner
+
+
+
+
+class ParallelDocManager(DocumentManager):
+    """DocumentManager that can partition a queryset by id modulo."""
+
+    def mines_of(self, qset, number_of_processors, my_id):
+        """Return the slice of *qset* whose id % number_of_processors == my_id."""
+        K = number_of_processors
+        N = my_id
+        # '%%%%' collapses to '%%' after this %-formatting; presumably the
+        # db layer then unescapes '%%' to a literal '%' in the SQL — confirm
+        # against the backend in use.
+        clause = 'id %%%% %s = %s' % (K, N)
+        return qset.extra(where=[clause])
+
+def start_preprocess(docs, increment_ner):
+    """Run the preprocessing pipeline (SelfPreprocesser) over *docs*."""
+    pipeline = PreProcessPipeline([
+        SelfPreprocesser(increment_ner),
+        # SyntacticSegmenterRunner(increment=True)
+    ], docs)
+    pipeline.process_everything()
+
+if __name__ == '__main__':
+    logger = logging.getLogger(u'preprocess')
+    logger.setLevel(logging.INFO)
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+    opts = docopt(__doc__, version=iepy.__version__)
+    increment_ner = opts['--increment-ner']
+
+    dm = ParallelDocManager()
+    all_docs = dm.get_documents_lacking_preprocess(
+        [PreProcessSteps.brat])
+
+    multiple_cores = opts.get('--multiple-cores')
+    split_in = opts.get("--split-in")
+    run_part = opts.get("--run-part")
+
+    if multiple_cores:
+        if multiple_cores == "all":
+            multiple_cores = multiprocessing.cpu_count()
+        try:
+            multiple_cores = int(multiple_cores)
+        except ValueError:
+            logger.error("Invalid number of cores")
+            exit(1)
+
+        for i in range(multiple_cores):
+            process = multiprocessing.Process(
+                target=start_preprocess, args=(dm.mines_of(all_docs, multiple_cores, i), increment_ner)
+            )
+            process.start()
+    elif split_in:
+        try:
+            split_in = int(split_in)
+            run_part = int(run_part) - 1
+        except ValueError:
+            logger.error("Invalid split")
+            exit(1)
+
+        if run_part < 0 or run_part > split_in:
+            logger.error("Parts must be between 1 and {}".format(split_in))
+            exit(1)
+
+        docs = dm.mines_of(all_docs, split_in, run_part)
+        start_preprocess(docs, increment_ner)
+    else:
+        start_preprocess(all_docs, increment_ner)

+ 149 - 0
examples/test/bin/rules_verifier.py

@@ -0,0 +1,149 @@
+"""
+IEPY rules verifier
+
+
+Usage:
+    rules_verifier.py <relation> [options]
+
+Options:
+  --shuffle             Chooses the sample randomly and not the first ones
+  --create-evidences    Creates evidences that are missing [default: false]
+  -r --rule=<rule>      Tests only this rule
+  -l --limit=<limit>    Limits the amount of evidences uses
+  -h --help             Show this screen
+"""
+
+import sys
+import logging
+from docopt import docopt
+
+import refo
+from django.core.exceptions import ObjectDoesNotExist
+from colorama import init as colorama_init
+
+import iepy
+iepy.setup(__file__)
+
+from iepy.data import models
+from iepy.data.models import EvidenceCandidate
+from iepy.data.db import CandidateEvidenceManager
+from iepy.extraction.terminal import TerminalEvidenceFormatter
+from iepy.extraction.rules import (
+    load_rules, compile_rule, generate_tokens_to_match
+)
+from iepy.metrics import result_dict_from_predictions
+
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+def run_from_command_line():
+    """Verify the instance's rules against labeled evidences for a relation.
+
+    Parses options, compiles every selected rule to a regex, gathers
+    candidate evidences (creating them if none exist), resolves labels with
+    newest-wins, and delegates to run_tests.
+    """
+    opts = docopt(__doc__, version=iepy.__version__)
+    relation_name = opts.get("<relation>")
+    limit = opts.get("--limit")
+    rule_name = opts.get("--rule")
+    shuffle = opts.get("--shuffle")
+    create_evidences = opts.get("--create-evidences")
+
+    # -1 presumably means "no segment limit" downstream — confirm against
+    # candidates_for_relation.
+    if limit is None:
+        limit = -1
+
+    try:
+        limit = int(limit)
+    except ValueError:
+        logging.error("Invalid limit value, it must be a number")
+        sys.exit(1)
+
+    try:
+        relation = models.Relation.objects.get(name=relation_name)
+    except ObjectDoesNotExist:
+        logging.error("Relation {!r} not found".format(relation_name))
+        sys.exit(1)
+
+    # Load rules and pre-compile each into (name, regex, answer).
+    rules = get_rules(rule_name)
+    rule_regexes = [
+        (rule.__name__, compile_rule(rule, relation), rule.answer) for rule in rules
+    ]
+
+    # Load evidences; force creation when none exist yet.
+    if EvidenceCandidate.objects.all().count() == 0:
+        create_evidences = True
+    evidences = CandidateEvidenceManager.candidates_for_relation(
+        relation, create_evidences, seg_limit=limit, shuffle_segs=shuffle
+    )
+    conflict_solver = CandidateEvidenceManager.conflict_resolution_newest_wins
+    answers = CandidateEvidenceManager.labels_for(
+        relation, evidences, conflict_solver
+    )
+    run_tests(rule_regexes, evidences, answers)
+
+
+def run_tests(rule_regexes, evidences, answers):
+    """Match every rule regex against every evidence and report metrics.
+
+    Prints the evidences matched by each rule; for evidences that have a
+    human label, records the rule's answer on match (False otherwise) and
+    finally prints precision/recall/accuracy/f1 when any labels existed.
+    """
+    predictions = []
+    real_labels = []
+    evidences_with_labels = []
+
+    colorama_init()
+    formatter = TerminalEvidenceFormatter()
+
+    for name, regex, answer in rule_regexes:
+        title = "Matches for rule '{}' (value: {})".format(name, answer)
+        print("\n{}\n{}".format(title, "-" * len(title)))
+
+        anything_matched = False
+        for evidence in evidences:
+            tokens_to_match = generate_tokens_to_match(evidence)
+            match = refo.match(regex, tokens_to_match)
+
+            if match:
+                anything_matched = True
+                print("  * {}".format(formatter.colored_text(evidence)))
+
+            # Only labeled evidences contribute to the metrics.
+            if evidence in answers and answers[evidence] is not None:
+                evidences_with_labels.append(evidence)
+                real_labels.append(answers[evidence])
+
+                if match:
+                    predictions.append(answer)
+                else:
+                    predictions.append(False)
+
+        if not anything_matched:
+            print("  nothing matched")
+
+        print()
+
+    if real_labels:
+        results = result_dict_from_predictions(
+            evidences_with_labels, real_labels, predictions
+        )
+        # end_time is bookkeeping, not a metric worth printing.
+        results.pop("end_time")
+        keys = [
+            "true_positives", "true_negatives",
+            "false_positives", "false_negatives",
+            "precision", "recall",
+            "accuracy", "f1",
+        ]
+
+        title = "Metrics"
+        print("{}\n{}".format(title, "-" * len(title)))
+        for key in keys:
+            print("{:>15}: {:.2f}".format(key, results[key]))
+
+
+def get_rules(rule_name):
+    # Load rules
+    rules = load_rules()
+
+    if rule_name:
+        rules = [x for x in rules if x.__name__ == rule_name]
+        if not rules:
+            logging.error("rule '{}' does not exists".format(rule_name))
+            sys.exit(1)
+
+    return rules
+
+
+if __name__ == "__main__":
+    run_from_command_line()

+ 251 - 0
examples/test/bin/settlement.py

@@ -0,0 +1,251 @@
+
+
+
+
+from django.db.models import Q
+import datetime,time
+import iepy
+iepy.setup(__file__)
+from iepy.data.db import DocumentManager
+from iepy.data.models import IEDocument,LabeledIEDocument,IEDocumentMetadata,LabeledIEDocumentMetadata,Payroll
+from brat.models import BratAnnotation,LabeledBratAnnotation
+from django.db import transaction
+import pandas as pd
+from django.contrib.auth.models import User
+
+def object_to_dict(obj,class_model):
+    '''
+    Convert a model instance into a {field_name: value} dict.
+
+    :param obj: the model instance
+    :param class_model: the django model class of obj
+    :return: dict built from the instance's concrete local fields
+    '''
+    _dict = {}
+    concrete_model = class_model._meta.concrete_model
+    for field in concrete_model._meta.local_fields:
+        value = field.value_from_object(obj)
+        _dict[field.name] = value
+    return _dict
+
+
+class Settlement():
+
+    '''
+    @summary: 结算类,定义了结算者所需要执行的各种方法
+    '''
+
+    def makePayroll(self,list_user,begin_time,time_end):
+        '''
+        Create or refresh one Payroll row per user for the period.
+
+        :param list_user: list of usernames to settle
+        :param begin_time: NOTE(review): unused — the effective start is the
+            user's last settled end_time, or "2020-08-01" if none exists
+        :param time_end: end of the period ('yyyy-mm-dd', inclusive)
+        :return: None; creates/updates Payroll rows per user
+        '''
+        # NOTE(review): every query below is built with %-string formatting.
+        # Safe only while usernames/dates come from trusted code — switch to
+        # parameterized queries if these values can ever be user-supplied.
+        from django.db import connection
+        with transaction.atomic():
+            cursor = connection.cursor()
+            time_begin = "2020-08-01"
+            for _user in list(set(list_user)):
+                # Resume from the user's last settled period, if any.
+                sql = 'select max(end_time) from corpus_payroll where "user"=\'%s\''%(_user)
+                cursor.execute(sql)
+                rows = cursor.fetchall()
+                if rows[0][0] is not None:
+                    time_begin = rows[0][0]
+                else:
+                    time_begin = "2020-08-01"
+
+                # Documents edited by the user inside the period.
+                sql = " select count(1) from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s'"%(_user,time_end,time_begin)
+                cursor.execute(sql)
+                doc_count = cursor.fetchall()[0][0]
+                # T (entity) / R (relation) annotations within the latest
+                # 1200 documents vs. over all documents; the wage formula
+                # below pays the two ranges at different rates.
+                sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"T%")
+                cursor.execute(sql)
+                t_count = cursor.fetchall()[0][0]
+                sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s' order by edittime desc limit 1200) and value like '%s' "%(_user,time_end,time_begin,"R%")
+                cursor.execute(sql)
+                r_count = cursor.fetchall()[0][0]
+                sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s') and value like '%s' "%(_user,time_end,time_begin,"T%")
+                cursor.execute(sql)
+                all_t_count = cursor.fetchall()[0][0]
+                sql = " select count(1) from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' and to_char(edittime,'yyyy-mm-dd')>'%s') and value like '%s' "%(_user,time_end,time_begin,"R%")
+                cursor.execute(sql)
+                all_r_count = cursor.fetchall()[0][0]
+                wage = round(0.03*t_count+0.05*r_count+(all_t_count-t_count)*0.04+(all_r_count-r_count)*0.06,2)
+                print(doc_count,t_count,r_count,wage)
+                # Upsert the Payroll row for (user, begin, end).
+                payrolls = Payroll.objects.filter(Q(user=_user)& Q(begin_time=time_begin) & Q(end_time=time_end))
+                if len(payrolls)==0:
+                    _payroll = Payroll.objects.create(**{"user":_user,"doc_count":doc_count,"begin_time":time_begin,"end_time":time_end,"t_count":all_t_count,"r_count":all_r_count,"wage":wage})
+                    _payroll.save()
+                else:
+                    _payroll = payrolls[0]
+                    _payroll.doc_count = doc_count
+                    _payroll.t_count = all_t_count
+                    _payroll.r_count = all_r_count
+                    _payroll.wage = wage
+                    _payroll.save()
+
+    def exportPayroll(self,begin_time,end_time):
+        '''
+        Export matching Payroll rows to an .xls file (Chinese column names).
+
+        :param begin_time: period start filter; when None, filter on
+            end_time only
+        :param end_time: period end filter
+        :return: None; writes "<begin>-<end>要素标注统计.xls" in the cwd
+        '''
+        list_user = []
+        list_doc_count = []
+        list_t_count = []
+        list_r_count = []
+        list_wage = []
+        list_yield = []
+        list_account = []
+        list_begin = []
+        list_end = []
+        if begin_time is not None:
+            payrolls = Payroll.objects.filter(Q(begin_time=begin_time) & Q(end_time=end_time))
+        else:
+            payrolls = Payroll.objects.filter(Q(end_time=end_time))
+        # NOTE(review): assumes Payroll has _yield and account fields —
+        # confirm against the model definition.
+        for _payroll in payrolls:
+            list_user.append(_payroll.user)
+            list_doc_count.append(_payroll.doc_count)
+            list_t_count.append(_payroll.t_count)
+            list_r_count.append(_payroll.r_count)
+            list_wage.append(_payroll.wage)
+            list_yield.append(_payroll._yield)
+            list_account.append(_payroll.account)
+            list_begin.append(_payroll.begin_time)
+            list_end.append(_payroll.end_time)
+        df = pd.DataFrame({"用户":list_user,"开始时间":list_begin,"结束时间":list_end,"文章数":list_doc_count,"要素数":list_t_count,"关系数":list_r_count,"总价":list_wage,"合格率":list_yield,"结算价":list_account})
+        df.to_excel("%s-%s要素标注统计.xls"%(begin_time,end_time),columns=["用户","开始时间","结束时间","文章数","要素数","关系数","总价","合格率","结算价"])
+
+    def getAllUser(self):
+        """Return the usernames of every staff user (auth_user.is_staff)."""
+        from django.db import connection
+        with transaction.atomic():
+            list_user = []
+            cursor = connection.cursor()
+            sql = "select username from auth_user where is_staff='t'"
+            cursor.execute(sql)
+            for row in cursor.fetchall():
+                list_user.append(row[0])
+            return list_user
+
+
+    def makeMigrate(self,_user,time_begin,time_end):
+        '''
+        Migrate a user's data for the period into the "labeled" tables.
+
+        :param _user: username
+        :param time_begin: start of the period
+        :param time_end: end of the period
+        :return: None — NOT IMPLEMENTED; the body below is a commented-out
+            prototype kept for reference.
+        '''
+        pass
+        # from django.db import connection
+        # with transaction.atomic():
+        #     cursor = connection.cursor()
+        #     sql = " select human_identifier,offsets_to_text,sentences from corpus_iedocument where edituser is null"
+        #     cursor.execute(sql)
+        #     cursor1 = connection.cursor()
+        #     _index = 0
+        #     rows = True
+        #     while(rows):
+        #         rows=cursor.fetchmany(1000)
+        #         for row in rows:
+        #             _index += 1
+        #             print(_index)
+        #             human_identifier,offsets_to_text,sentences = row
+        #             if sentences!="[]":
+        #                 _off = offsets_to_text.split(", ")[-1][:-1]
+        #                 _sen = sentences.split(", ")[-1][:-1]
+        #                 print(_off,_sen)
+        #                 if int(_off)!=int(_sen):
+        #                     offsets_to_text = offsets_to_text[:-1]+", "+str(int(_sen))+"]"
+        #                     print(offsets_to_text)
+        #                     cursor1.execute("update corpus_iedocument set offsets_to_text='%s' where human_identifier='%s'"%(offsets_to_text,human_identifier))
+
+
+
+            # ieDocuments = IEDocument.objects.filter(Q(edituser=_user) & Q(edittime__range=(time_begin,time_end)))
+            # for obj in ieDocuments:
+            #     _dict = object_to_dict(obj,IEDocument)
+            #     _dict_meta = object_to_dict(obj.metadata,IEDocumentMetadata)
+            #     labeledMeta = LabeledIEDocumentMetadata.objects.create(**_dict_meta)
+            #     labeledMeta.save()
+            #     _dict["metadata"] = labeledMeta
+            #     tmp = LabeledIEDocument.objects.create(**_dict)
+            #     tmp.save()
+            #
+            #     bratAnnotations = BratAnnotation.objects.filter(Q(document_id=obj.human_identifier))
+            #     for ann in bratAnnotations:
+            #         _dict_ann = object_to_dict(ann,BratAnnotation)
+            #         labeledAnn = LabeledBratAnnotation.objects.create(**_dict_ann)
+            #         labeledAnn.save()
+
+
+    def getPercentOfPass(self,_user,time_begin,time_end):
+        '''
+        Intended to compute the pass rate of a user's annotations in the
+        period. NOTE(review): not implemented — the docstring is the whole
+        body, so the method always returns None.
+
+        :param _user: username
+        :param time_begin: start of the period
+        :param time_end: end of the period
+        :return: None (unimplemented)
+        '''
+
+    def makePayrolls(self,time_begin,time_end):
+        '''
+        :param time_begin:起始时间
+        :param time_end: 截至时间
+        :return: 获得所有用户的工资表
+        '''
+        for _user in self.getAllUser():
+            self.makePayroll(_user,time_begin,time_end)
+        self.exportPayroll(time_begin,time_end)
+
+    def createUser_batch(self,batch_size=90):
+        '''
+        Create batch_size accounts named "bidi1".."bidiN" with the username
+        as password.
+
+        :param batch_size: number of users to create
+        :return: None
+        '''
+        list_user = [User.objects.create_user(username="bidi%d"%(i+1),password="bidi%d"%(i+1)) for i in range(batch_size)]
+
+    def exportLabels(self):
+        """Export per-group labeling activity to 分组_<n>.xls files.
+
+        Each hard-coded [start, end] range selects usernames bidi<start> to
+        bidi<end>; their edited documents (user, id, date) are dumped to one
+        excel file per group.
+        """
+        groups = [[1,7],[8,14],[15,22],[23,29],[30,36],[37,43],[44,50],[51,56],[57,62],[63,71]]
+        from django.db import connection
+        cursor = connection.cursor()
+        for _i in range(len(groups)):
+            _begin,_end = groups[_i]
+            list_username = []
+            list_user = []
+            list_label = []
+            list_time = []
+            for _j in range(_begin,_end+1):
+                username = "bidi%d"%_j
+                list_username.append("'%s'"%username)
+            sql = " select edituser,human_identifier,to_char(edittime,'yyyy-mm-dd') from corpus_iedocument where edituser in(%s) order by edittime asc"%(",".join(list_username))
+            print(sql)
+            cursor.execute(sql)
+            rows = cursor.fetchall()
+            for row in rows:
+                list_user.append(row[0])
+                list_label.append(row[1])
+                list_time.append(row[2])
+            df = pd.DataFrame({"时间":list_time,"用户":list_user,"文章编号":list_label})
+            df.to_excel("分组_%d.xls"%(_i+1),columns=["时间","用户","文章编号"])
+
+    def filter(self):
+        '''
+        Filter out auction notices: flag every document whose text contains
+        "拍卖" (auction) by setting jump_signal = 1.
+        NOTE(review): iterates and saves ALL documents one by one — slow on
+        a large corpus; the method name also shadows the builtin filter.
+        :return: None
+        '''
+        import re
+        ieDocuments = IEDocument.objects.all()
+        for obj in ieDocuments:
+            if re.search("拍卖",obj.text) is not None:
+                obj.jump_signal = 1
+                obj.save()
+                print(obj.human_identifier)
+
+
+
+# Ad-hoc driver: hard-coded usernames/dates; other operations are kept
+# commented out and toggled by hand when needed.
+if __name__=="__main__":
+    settle = Settlement()
+    # settle.makeMigrate("test","2020-08-01","2020-08-31")
+    settle.makePayroll(["test3","test19","test22","test2","test9","test11","test12","test1","test7","test21","test17"],"2020-08-01","2020-12-25")
+    # settle.makePayrolls("2020-08-01","2020-08-31")
+    settle.exportPayroll(begin_time=None,end_time='2020-12-25')
+    # settle.createUser_batch(batch_size=102)
+    # settle.exportLabels()
+    # settle.filter()

BIN
examples/test/bin/分组_1.xls


BIN
examples/test/bin/分组_10.xls


BIN
examples/test/bin/分组_2.xls


BIN
examples/test/bin/分组_3.xls


BIN
examples/test/bin/分组_4.xls


BIN
examples/test/bin/分组_5.xls


BIN
examples/test/bin/分组_6.xls


BIN
examples/test/bin/分组_7.xls


Certains fichiers n'ont pas été affichés car il y a eu trop de fichiers modifiés dans ce diff