12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621 |
- #coding:UTF-8
- # !/usr/bin/python
- # -*- coding: <utf-8> -*-
- import ast
- import copy
- import re
- import sys
- import os
- import time
- import codecs
- from datetime import datetime
- import psycopg2
- import pandas as pd
- sys.setrecursionlimit(1000000)
- sys.path.append(os.path.abspath("../.."))
- sys.path.append(os.path.abspath(".."))
- # 数据预处理,将数据转为BIO标注类型
def Postgre2Data():
    """Load every edited corpus document and its brat text-bound annotations.

    Returns:
        tuple: ``(corpus_iedocument, brat_labeledbratannotation)``.

        ``corpus_iedocument`` is ``[human_identifier, tokens, sentences, text]``:
        four parallel lists with one entry per document. ``tokens`` entries are
        lists of strings parsed from the stored list representation;
        ``sentences`` and ``text`` are passed through as stored.

        ``brat_labeledbratannotation`` has one ``[document_ids, values]`` pair
        per document, in the same order, restricted to annotation values that
        start with ``"T"``.
    """
    connect = psycopg2.connect(database="iepy", user="iepy_read", password="iepy_read",
                               host="192.168.2.101", port="5432")
    # try/finally guarantees that the cursor and the connection are released
    # even when a query or the row processing raises.
    try:
        cursor = connect.cursor()
        try:
            cursor.execute("SELECT human_identifier, tokens, sentences, text, edittime, edituser"
                           " FROM corpus_iedocument"
                           " where edituser is not NULL")
            result = cursor.fetchall()
            print(len(result))

            human_identifier = []
            tokens = []
            sentences = []
            text = []
            for row in result:
                human_identifier.append(row[0])
                # The tokens column holds the textual form of a list of strings;
                # strip the brackets and quotes, then split on the separator.
                raw_tokens = row[1].replace("[", "").replace("]", "").replace("\'", "")
                tokens.append(raw_tokens.split(", "))
                sentences.append(row[2])
                text.append(row[3])
            corpus_iedocument = [human_identifier, tokens, sentences, text]

            # One [document_ids, values] pair per document, aligned with
            # corpus_iedocument.
            brat_labeledbratannotation = []
            for identifier in human_identifier:
                # Parameterized query: the identifier is never spliced into SQL.
                cursor.execute('select document_id, value from brat_bratannotation '
                               'where document_id = %s', (identifier,))
                document_id = []
                value = []
                for row in cursor.fetchall():
                    # Keep only values starting with "T"; startswith also copes
                    # with an empty value instead of raising IndexError.
                    if not row[1].startswith('T'):
                        continue
                    document_id.append(row[0])
                    value.append(row[1])
                brat_labeledbratannotation.append([document_id, value])
        finally:
            cursor.close()
    finally:
        connect.close()
    return corpus_iedocument, brat_labeledbratannotation
def Text2Csv(output_path="C:\\Users\\admin\\Desktop\\text.csv"):
    """Write one CSV row per annotation, paired with its document text.

    Args:
        output_path: Destination CSV file. Defaults to the path the script
            has always written to.

    The CSV columns are ``document_id``, ``text``, ``word``, ``category`` and
    ``categoryAndIndex`` (every field of the annotation after its id).
    """
    corpus_iedocument, brat_labeledbratannotation = Postgre2Data()
    text_list = []
    document_id_list = []
    manual_BIO_list = []
    category_list = []
    word_list = []
    # One iteration per document; annotations are aligned with the texts.
    for text, (document_ids, values) in zip(corpus_iedocument[3], brat_labeledbratannotation):
        for document_id, value in zip(document_ids, values):
            # Normalise tab separators to spaces before splitting. As visible
            # in this file the call replaced a space with a space (a no-op),
            # which leaves tab-separated fields glued together.
            categoryAndIndex = value.replace("\t", " ").split(" ")[1:]
            if not categoryAndIndex:
                # Annotation with an id but no fields: nothing to export.
                continue
            document_id_list.append(document_id)
            text_list.append(text)
            category_list.append(categoryAndIndex[0])
            word_list.append(categoryAndIndex[-1])
            manual_BIO_list.append(categoryAndIndex)
    text_dict = {'document_id': document_id_list, 'text': text_list, 'word': word_list,
                 'category': category_list, 'categoryAndIndex': manual_BIO_list}
    pd.DataFrame(text_dict).to_csv(output_path)
def Csv2BidwayText(input_path="C:\\Users\\admin\\Desktop\\text.csv",
                   output_path="C:\\Users\\admin\\Desktop\\bidway_text.csv"):
    """Extract the rows whose category is ``bidway`` into their own CSV.

    Args:
        input_path: CSV holding at least the columns ``document_id``,
            ``text``, ``categoryAndIndex``, ``word`` and ``category``.
        output_path: Destination CSV file.

    Columns are selected by name. Relabelling them positionally is only
    correct when the input happens to be in alphabetical column order, and
    silently swaps the data otherwise.
    """
    df = pd.read_csv(input_path)
    df = df[df["category"] == "bidway"].reset_index(drop=True)
    df = df[["document_id", "text", "categoryAndIndex", "word", "category"]]
    df.to_csv(output_path)
def Csv2ServiceTimeText(input_path="C:\\Users\\admin\\Desktop\\text.csv",
                        output_path="C:\\Users\\admin\\Desktop\\serviceTime_text.csv"):
    """Extract the rows whose category is ``serviceTime`` into their own CSV.

    Args:
        input_path: CSV holding at least the columns ``document_id``,
            ``text``, ``categoryAndIndex``, ``word`` and ``category``.
        output_path: Destination CSV file.

    Columns are selected by name. Relabelling them positionally is only
    correct when the input happens to be in alphabetical column order, and
    silently swaps the data otherwise.
    """
    df = pd.read_csv(input_path)
    df = df[df["category"] == "serviceTime"].reset_index(drop=True)
    df = df[["document_id", "text", "categoryAndIndex", "word", "category"]]
    df.to_csv(output_path)
def _add_bio_label(word_BIO_list, position, label):
    """Record ``label`` for the word at ``position`` without losing earlier labels."""
    current = word_BIO_list[position]
    if current[0] == 0:
        # Still the untagged placeholder: start a fresh label list.
        word_BIO_list[position] = [label]
    elif label not in current:
        current.append(label)
    # A label that is already present is left alone, so a duplicate
    # annotation cannot wipe out the other labels of the word.


def data2BIOData():
    """Convert the annotated corpus into per-word BIO labels.

    Returns:
        tuple: four lists with one entry per document:

        * the document's words (tokens),
        * per word, the list of its BIO labels (``["O"]`` when untagged),
        * per located word, ``"begin,end"`` character offsets relative to the
          start of its sentence,
        * per located word, the text of the sentence slice it falls in.
    """
    corpus_iedocument, brat_labeledbratannotation = Postgre2Data()
    words_list_all = []
    word_BIO_list_all = []
    wordInSentencesIndex_list_all = []
    wordInSentencesTokens_list_all = []

    # One iteration per document.
    for i in range(len(corpus_iedocument[0])):
        words = corpus_iedocument[1][i]
        text = corpus_iedocument[3][i]
        words_list_all.append(words)

        # Structure the manual annotations as [category, start, end] and sort
        # them by start offset. Tabs are normalised to spaces first; as visible
        # in this file the call replaced a space with a space (a no-op).
        categoryAndIndex_list = [
            data.replace("\t", " ").split(" ")[1:4]
            for data in brat_labeledbratannotation[i][1]
        ]
        categoryAndIndex_list.sort(key=lambda c: int(c[1]))

        # Cut the document text at the stored sentence boundaries.
        sentences_list = []
        index_begin = 0
        for boundary in corpus_iedocument[2][i][1:-1].split(",")[1:]:
            index_end = int(boundary)
            sentences_list.append(text[index_begin:index_end])
            index_begin = index_end

        # Tag every word. A word may carry several labels, so each word owns
        # its own list (0 marks "not tagged yet").
        word_BIO_list = [[0] for _ in words]
        for annotation in categoryAndIndex_list:
            category = annotation[0]
            start_offset = int(annotation[1])
            word_index = 0   # character offset of the current word in the text
            tag_flag = ""    # category of the entity opened by this annotation
            tag_index = 0    # end offset of that entity
            for word_flag, word in enumerate(words):
                if word_index == start_offset:
                    _add_bio_label(word_BIO_list, word_flag, "B-" + category)
                    tag_flag = category
                    tag_index = int(annotation[2])
                elif word_index < tag_index - 1 and tag_flag != "":
                    _add_bio_label(word_BIO_list, word_flag, "I-" + tag_flag)
                # Blank words still occupy one character of the text.
                word_index += len(word) if word else 1

        # Everything left untagged becomes "O".
        word_BIO_list = [labels if labels[0] != 0 else ["O"] for labels in word_BIO_list]
        word_BIO_list_all.append(word_BIO_list)

        # Locate each word inside its sentence: the offsets so far were
        # document-wide, the output offsets are sentence-relative.
        wordInSentencesIndex_list = []
        wordInSentencesTokens_list = []
        sentences_index_list = corpus_iedocument[2][i][1:-1].split(", ")
        sentence_number = 0
        word_index = 0
        for word in words:
            # Advance to the next sentence once the word starts at or after
            # the next boundary (at most one sentence per word).
            if (sentence_number + 1 < len(sentences_index_list)
                    and word_index >= int(sentences_index_list[sentence_number + 1])):
                sentence_number += 1
            if sentences_index_list[sentence_number] == "":
                # No usable boundary information for this document.
                continue
            word_length = len(word) if word else 1
            begin_index = word_index - int(sentences_index_list[sentence_number])
            wordInSentencesIndex_list.append(
                str(begin_index) + "," + str(begin_index + word_length))
            if sentence_number < len(sentences_list):
                wordInSentencesTokens_list.append(sentences_list[sentence_number])
            elif sentences_list:
                # Words after the last boundary reuse the last sentence slice.
                wordInSentencesTokens_list.append(sentences_list[-1])
            else:
                # A single boundary yields no slices: the whole text is the
                # sentence (indexing [-1] here would raise IndexError).
                wordInSentencesTokens_list.append(text)
            word_index += word_length
        wordInSentencesIndex_list_all.append(wordInSentencesIndex_list)
        wordInSentencesTokens_list_all.append(wordInSentencesTokens_list)

    return words_list_all, word_BIO_list_all, wordInSentencesIndex_list_all, wordInSentencesTokens_list_all
def BIOData2TXT(output_path='C:\\Users\\admin\\Desktop\\BIOData_list.txt'):
    """Dump the BIO data to a text file as the ``str()`` of one list.

    Args:
        output_path: Destination file, written as UTF-8. The content can be
            read back with ``ast.literal_eval``.
    """
    words_list_all, word_BIO_list_all, \
        wordInSentencesIndex_list_all, wordInSentencesTokens_list_all = data2BIOData()
    print(words_list_all)
    print(type(word_BIO_list_all))
    print(len(wordInSentencesIndex_list_all))
    print(len(wordInSentencesTokens_list_all))
    # The context manager closes the file even if the write fails.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(str([words_list_all, word_BIO_list_all,
                        wordInSentencesIndex_list_all, wordInSentencesTokens_list_all]))
def TXT2BIOData(input_path='C:\\Users\\admin\\Desktop\\BIOData_list.txt'):
    """Read back the four BIO lists from a UTF-8 text dump.

    Args:
        input_path: File holding the ``str()`` of a list with at least four
            elements.

    Returns:
        tuple: the first four elements of the stored list.
    """
    start_time = time.time()
    # The context manager closes the file even if reading or parsing fails.
    with open(input_path, 'r', encoding='utf-8') as file:
        # literal_eval only accepts Python literals, so the file content
        # cannot execute code.
        list1 = ast.literal_eval(file.read())
    end_time = time.time()
    print("耗时:", end_time-start_time)
    return list1[0], list1[1], list1[2], list1[3]
def BIOData2DataFrame(output_path="C:\\Users\\admin\\Desktop\\BIO.csv"):
    """Write every word with its BIO label list to a two-column CSV.

    Args:
        output_path: Destination CSV file with the columns ``Word`` and ``BIO``.
    """
    words_list_all, word_BIO_list_all, _, _ = data2BIOData()
    # One two-column frame per document. Concatenating once replaces
    # DataFrame.append, which was removed in pandas 2.0, and keeps the
    # per-document row numbering in the output.
    frames = [
        pd.DataFrame([words, labels]).T
        for words, labels in zip(words_list_all, word_BIO_list_all)
    ]
    if frames:
        df = pd.concat(frames)
        df.columns = ["Word", "BIO"]
    else:
        # Empty corpus: write a header-only file instead of crashing.
        df = pd.DataFrame(columns=["Word", "BIO"])
    df.to_csv(output_path)
def PersonBIOData2BIO_Sentence(output_path="C:\\Users\\admin\\Desktop\\Person_Sentence_all_60.csv",
                               window=60):
    """Export every person mention with a fixed window of surrounding words.

    Args:
        output_path: Destination CSV file with the columns ``Word``,
            ``Label``, ``Sentence`` and ``BIO``.
        window: Number of words taken before and after the mention.

    ``Sentence`` is ``"<before>||<mention>||<after>"`` with the context words
    separated by spaces, or empty when a full window is not available.
    ``Label`` codes: 0 person name, 1 contact, 2 tenderee contact, 3 agency
    contact, 4 anything else tagged (per the original note: evaluation
    expert), 5 untagged.
    """
    def person_tag(labels):
        # Collapse the label list of one word into a single string: person
        # labels are joined with commas, any other label turns it into "O".
        tag = ""
        for label in labels:
            if label[2:8] == "person":
                if tag == "":
                    tag = label
                elif tag != "O":
                    tag = tag + "," + label
            else:
                tag = "O"
        return tag

    def person_label(tag):
        if tag == "O":
            return 5
        role = tag[9:]
        if role == "person":
            return 1
        if role == "tendereePerson":
            return 2
        if role == "agencyPerson":
            return 3
        return 0 if tag[2:] == "person" else 4

    words_list_all, word_BIO_list_all, _, _ = data2BIOData()

    # Flatten all documents into two parallel lists.
    words = []
    tags = []
    for doc_words, doc_labels in zip(words_list_all, word_BIO_list_all):
        words.extend(doc_words)
        tags.extend(person_tag(labels) for labels in doc_labels)

    # Fold each I- token into the word that opened the entity. Merging into
    # the opening row (not simply the previous row) keeps entities of three or
    # more tokens intact, because the previous row is itself dropped.
    head = None
    dropped = set()
    for position, tag in enumerate(tags):
        if tag[:1] == "B":
            head = position
        elif tag[:1] == "I" and head is not None:
            words[head] = words[head] + words[position]
            dropped.add(position)
        else:
            head = None
    merged_words = [w for position, w in enumerate(words) if position not in dropped]
    merged_tags = [t for position, t in enumerate(tags) if position not in dropped]

    # Build the context window for every tagged word and drop the "O" rows.
    total = len(merged_words)
    out_words = []
    out_tags = []
    out_sentences = []
    for position, tag in enumerate(merged_tags):
        if tag == "O":
            continue
        sentence = None
        # Both sides of the window must lie inside the data; otherwise the
        # indices would wrap around at the start or run past the end.
        if position > window and total - position > window:
            before = "".join(w + " " for w in merged_words[position - window:position])
            after = "".join(w + " " for w in merged_words[position + 1:position + window + 1])
            sentence = before + "||" + merged_words[position] + "||" + after
        out_words.append(merged_words[position])
        out_tags.append(tag)
        out_sentences.append(sentence)

    df = pd.DataFrame({
        "Word": out_words,
        "Label": [person_label(tag) for tag in out_tags],
        "Sentence": out_sentences,
        "BIO": out_tags,
    }, columns=["Word", "Label", "Sentence", "BIO"])
    df.to_csv(output_path)
def BIOData2PersonData(output_path="C:\\Users\\admin\\Desktop\\Person_Data_all.csv"):
    """Export every person mention with its sentence and in-sentence offsets.

    Args:
        output_path: Destination CSV file with the columns ``Word``,
            ``Label``, ``tokens``, ``begin_index`` and ``end_index``.

    ``Label`` codes: 0 person name, 1 contact, 2 tenderee contact, 3 agency
    contact, 4 anything else tagged (per the original note: evaluation
    expert). Untagged words (code 5) are not written. The row numbers in the
    CSV keep their gaps, as before.
    """
    def person_tag(labels):
        # Collapse the label list of one word into a single string: person
        # labels are joined with commas, any other label turns it into "O".
        tag = ""
        for label in labels:
            if label[2:8] == "person":
                if tag == "":
                    tag = label
                elif tag != "O":
                    tag = tag + "," + label
            else:
                tag = "O"
        return tag

    def person_label(tag):
        if tag == "O":
            return 5
        role = tag[9:]
        if role == "person":
            return 1
        if role == "tendereePerson":
            return 2
        if role == "agencyPerson":
            return 3
        return 0 if tag[2:] == "person" else 4

    words_list_all, word_BIO_list_all, \
        wordInSentencesIndex_list_all, wordInSentencesTokens_list_all = data2BIOData()

    # Flatten the words and their collapsed person tags over all documents.
    words = []
    tags = []
    for doc_words, doc_labels in zip(words_list_all, word_BIO_list_all):
        words.extend(doc_words)
        tags.extend(person_tag(labels) for labels in doc_labels)
    labels = [person_label(tag) for tag in tags]

    # Flatten the located-word data: sentence text and "begin,end" offsets.
    tokens = []
    begin_index = []
    end_index = []
    for doc_offsets, doc_tokens in zip(wordInSentencesIndex_list_all,
                                       wordInSentencesTokens_list_all):
        for offsets, sentence in zip(doc_offsets, doc_tokens):
            begin, end = offsets.split(",")[:2]
            begin_index.append(begin)
            end_index.append(end)
            tokens.append(sentence)

    # Fold each I- token into the word that opened the entity and extend that
    # word's end offset. Merging into the opening row (not simply the previous
    # row) keeps entities of three or more tokens intact.
    head = None
    dropped = set()
    for position, tag in enumerate(tags):
        if tag[:1] == "B":
            head = position
        elif tag[:1] == "I" and head is not None:
            end_index[head] = int(end_index[head]) + len(words[position])
            words[head] = words[head] + words[position]
            dropped.add(position)
        else:
            head = None

    def keep(values):
        return [v for position, v in enumerate(values) if position not in dropped]

    word_frame = pd.DataFrame({"Word": keep(words), "Label": keep(labels)},
                              columns=["Word", "Label"])
    location_frame = pd.DataFrame(
        {"tokens": keep(tokens), "begin_index": keep(begin_index), "end_index": keep(end_index)},
        columns=["tokens", "begin_index", "end_index"])
    df1 = pd.concat([word_frame, location_frame], axis=1)

    # Drop the untagged words (label 5).
    df1 = df1[df1["Label"] != 5]
    df1.to_csv(output_path)
def BIOData2Bidway(output_path="C:\\Users\\admin\\Desktop\\Bidway_BIO.csv"):
    """Write every word with its ``bidway`` BIO tag to a two-column CSV.

    Args:
        output_path: Destination CSV file with the columns ``Word`` and ``BIO``.

    The BIO data is read from the text dump via ``TXT2BIOData``.
    """
    def bidway_tag(labels):
        # Collapse the label list of one word into a single string: bidway
        # labels are joined with commas, any other label turns it into "O".
        tag = ""
        for label in labels:
            if label[2:8] == "bidway":
                if tag == "":
                    tag = label
                elif tag != "O":
                    tag = tag + "," + label
            else:
                tag = "O"
        return tag

    words_list_all, word_BIO_list_all, _, _ = TXT2BIOData()
    # One frame per document. Concatenating once replaces DataFrame.append,
    # which was removed in pandas 2.0, and keeps per-document row numbering.
    frames = [
        pd.DataFrame([words, [bidway_tag(labels) for labels in doc_labels]]).T
        for words, doc_labels in zip(words_list_all, word_BIO_list_all)
    ]
    if frames:
        df = pd.concat(frames)
        df.columns = ["Word", "BIO"]
    else:
        # No documents: write a header-only file instead of crashing.
        df = pd.DataFrame(columns=["Word", "BIO"])
    df.to_csv(output_path)
def BIOData2ServiceTime(output_path="C:\\Users\\admin\\Desktop\\ServiceTime_BIO.csv"):
    """Write every word with its ``serviceTime`` BIO tag to a two-column CSV.

    Args:
        output_path: Destination CSV file with the columns ``Word`` and ``BIO``.

    The BIO data is read from the text dump via ``TXT2BIOData``.
    """
    def service_time_tag(labels):
        # Collapse the label list of one word into a single string:
        # serviceTime labels are joined with commas, any other label turns it
        # into "O".
        tag = ""
        for label in labels:
            if label[2:] == "serviceTime":
                if tag == "":
                    tag = label
                elif tag != "O":
                    tag = tag + "," + label
            else:
                tag = "O"
        return tag

    words_list_all, word_BIO_list_all, _, _ = TXT2BIOData()
    # One frame per document. Concatenating once replaces DataFrame.append,
    # which was removed in pandas 2.0, and keeps per-document row numbering.
    frames = [
        pd.DataFrame([words, [service_time_tag(labels) for labels in doc_labels]]).T
        for words, doc_labels in zip(words_list_all, word_BIO_list_all)
    ]
    if frames:
        df = pd.concat(frames)
        df.columns = ["Word", "BIO"]
    else:
        # No documents: write a header-only file instead of crashing.
        df = pd.DataFrame(columns=["Word", "BIO"])
    df.to_csv(output_path)
def duplicateData(label, sample_rate, path="C:\\Users\\admin\\Desktop\\Person_Data_all_OverSample.csv"):
    """Oversample one class by appending a random sample of its rows in place.

    Args:
        label: Value of the ``Label`` column to oversample.
        sample_rate: Fraction of the matching rows to duplicate (passed to
            ``DataFrame.sample`` as ``frac``).
        path: CSV file that is read and then overwritten.
    """
    df = pd.read_csv(path)
    print(df.shape)
    extra = df[df["Label"] == label].sample(frac=sample_rate)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([df, extra])
    df.to_csv(path)
    print(df.shape)
def resetAndShuffleData(path="C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv",
                        random_state=None):
    """Shuffle the rows of a sentence CSV in place and renumber them.

    Args:
        path: CSV file that is read and then overwritten; it must contain the
            columns ``Word``, ``Label``, ``Sentence`` and ``BIO``.
        random_state: Optional seed for a reproducible shuffle.

    Only the four named columns are kept in the rewritten file.
    """
    df = pd.read_csv(path)
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    df = df[["Word", "Label", "Sentence", "BIO"]]
    df.to_csv(path)
def re_bidway(input_path="C:\\Users\\admin\\Desktop\\bidway_text.csv",
              output_path="C:\\Users\\admin\\Desktop\\bidway_text1.csv"):
    """Extract the bidding method from each text with a cascade of regexes.

    Args:
        input_path: CSV file with a ``text`` column.
        output_path: Destination CSV file; the input plus a ``re`` column that
            holds the extracted method (empty when nothing matched).
    """
    df = pd.read_csv(input_path)
    # Keyword, arbitrary filler, then a known bidding method.
    reg = re.compile('(采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式'
                     '|发包方式|发包类型|开展方式|招标类型)(.*)'
                     '(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
                     '|电子书面竞投|邀请招标|定向公开|询价采购|抽签摇号'
                     '|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
                     '|网上招标|其他'
                     '|竞谈竞价|网上直购|公开竞谈'
                     '|库内邀请|库内公开发包)')
    # Optional verb, a method, then a "method" suffix.
    reg2 = re.compile('(采用|以|)'
                      '(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
                      '|竞争性谈判|询价|电子书面竞投|电子竞价'
                      '|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
                      '|网上招标|分散采购'
                      '|竞谈竞价|网上直购|公开竞谈'
                      '|库内邀请)'
                      '(采购方式|方式)')
    # A bare method name.
    reg1 = re.compile('(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
                      '|竞争性谈判|询价|电子书面竞投'
                      '|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
                      '|网上招标|分散采购'
                      '|竞谈竞价|网上直购|公开竞谈'
                      '|库内邀请)')
    # Phrases that look like a method but are not; removed before matching.
    reg1_not = re.compile('及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录')
    # Last-resort, easily confused keywords.
    reg3 = re.compile('(采购方式:邀请|采购方式:公开|采购方式:询价|分散采购|公开招标|竞价|磋商|询比|竞标|邀请招标|公开招募|公开招租)')
    # The final, normalised label set.
    reg_standard = re.compile('(公开招标|竞争性磋商|竞争性谈判|单一来源'
                              '|竞争性谈判|询价|邀请招标|公开招募|询比|电子书面竞投'
                              '|网上电子投标|比质比价|询单|比选'
                              '|公开招租|网上招标|分散采购'
                              '|网上直购|公开竞谈|采购方式:邀请|采购方式:公开|采购方式:询价)')
    # Results containing these are kept verbatim (no clean-up).
    reg_keep = re.compile("分散采购|采购方式:邀请")
    # Clause separators, half- and full-width.
    reg_clause = re.compile(r"[,.,。;;]")
    # Colons, half- and full-width.
    reg_colon = re.compile(r"[::]")
    # Filler words stripped from the result. The parenthesis after 直接 is a
    # literal character (either width); written unescaped it unbalances the
    # group and the pattern fails to compile.
    reg_filler = re.compile(r"(采购|方式|采用|出售|进行|直接[((]|现就本次|招标为)")

    output_list = []
    for input_str in df["text"].to_list():
        if not isinstance(input_str, str):
            # Missing text is read as NaN: nothing to extract.
            output_list.append(None)
            continue
        # Remove the confusing phrases first.
        input_str = reg1_not.sub("", input_str)
        output_str = None
        match = reg.search(input_str)
        if match:
            if len(match.group()) >= 15:
                # Too long: cut into clauses and keep the first one that
                # names a method.
                for clause in reg_clause.split(match.group()):
                    if reg1.search(clause):
                        output_str = clause
                        break
            else:
                output_str = match.group()
        else:
            fallback = reg2.search(input_str) or reg1.search(input_str)
            if fallback:
                output_str = fallback.group()

        # Still too long: retry with the tighter patterns.
        if output_str is not None:
            if len(output_str) >= 15:
                match2 = reg2.search(input_str)
                if match2:
                    output_str = match2.group()
            if len(output_str) >= 15:
                match1 = reg1.search(input_str)
                if match1:
                    output_str = match1.group()

        # Nothing found: try the easily confused keywords.
        if output_str is None:
            match3 = reg3.search(input_str)
            if match3:
                output_str = match3.group()

        if output_str is not None:
            if not reg_keep.search(output_str):
                # "Open procurement" is normalised to "open tendering".
                output_str = re.sub("公开采购", "公开招标", output_str)
                # Keep only what follows the last colon.
                output_str = reg_colon.split(output_str)[-1]
                # Strip filler words.
                output_str = reg_filler.sub("", output_str)
            # Reduce to a standard label when one is present.
            match4 = reg_standard.search(output_str)
            if match4:
                output_str = match4.group()
        output_list.append(output_str)

    df["re"] = output_list
    df.to_csv(output_path)
def re_serviceTime(input_path="C:\\Users\\admin\\Desktop\\serviceTime_text.csv",
                   output_path="C:\\Users\\admin\\Desktop\\serviceTime_text1.csv"):
    """Extract the service period from each text with a cascade of regexes.

    Args:
        input_path: CSV file with the columns ``document_id``, ``text``,
            ``categoryAndIndex``, ``word`` and ``category``.
        output_path: Destination CSV file; the same columns plus ``re``,
            which holds the extracted period (empty when nothing matched).

    The patterns are tried in order (date range, keyword plus number,
    contract-signing phrase, keyword directly followed by a number). The
    first one that matches wins, and each result is printed as it is found.
    """
    # "service time/period: <from> to <until> day/hour".
    reg0 = re.compile(r'(服务时间:|服务期限:)'
                      r'([^至到]*)'
                      r'(至|到)'
                      r'([^日时]*)'
                      r'(日|时)')
    # Keyword, filler, one digit or numeral character, then a unit.
    reg = re.compile(r'(周期|工期|服务期|服务时限|履行期限|服务周期|交货期|供货期|合格工期'
                     r'|投标工期|设计工期|合格服务周期|总工期|施工工期|服务时间|流转期限|维护期限'
                     r'|完成时间|交付|服务期限|中标工期|项目周期|计划工期'
                     r')'
                     r'([^年月日\d+]*)'
                     r'([\d+|一|二|三|四|五|六|七|八|九|十])'
                     r'(日止|日内|年|年度|月|日|周内|年内|日历天|工作日|\d+日|\d+|起)'
                     r'(个月|\(日历天\)|)')
    # Confusing phrases, removed before matching.
    reg_not = re.compile(r'(工期延误|工期节点|工期管理|合同履行日期:见|服务期限截止|交付使用'
                         r'|服务期限:1、|工期\(交货期\):|工期、)')
    # Contract-signing phrase, filler, number, unit.
    reg1 = re.compile(r'(合同签订|签订合同|合同履行日期)'
                      r'([^\d]*)'
                      r'(\d+|一|二|三|四|五|六|七|八|九|十)'
                      r'(个|)'
                      r'(日止|日内|年|年度|月|日历天|日|周内|年内|工作日)')
    # Keyword, optional colon(s), number, optional unit.
    reg2 = re.compile(r'(服务期限|履行期限|工期|服务期|维护期限|服务周期|工期,\(日历天\),'
                      r'|服务期\(日历天\)|预定工期\(日历天\)|期限要求)'
                      r'(:|:|)+'
                      r'(\d+|一|二|三|四|五|六|七|八|九|十|两|贰|叁)'
                      r'(日止|日内|年|年度|月|日历天|日|周内|年内|工作日|天|)')
    # Commas, half- and full-width.
    reg_comma = re.compile(r"[,,]")

    def join_with_unit(matches):
        # Concatenate the groups from the third one on, for every match. The
        # unit is picked up from a group that is exactly "天" or "月" and is
        # appended after each concatenated group; it carries over to the
        # following matches of the same text.
        unit = ""
        parts = []
        for groups in matches:
            if "天" in groups:
                unit = "天"
            if "月" in groups:
                unit = "月"
            parts.append("".join(group + unit for group in groups[2:]))
        return " ".join(parts)

    df = pd.read_csv(input_path)
    output_list = []
    for input_str in df["text"].to_list():
        if not isinstance(input_str, str):
            # Missing text is read as NaN: nothing to extract.
            output_list.append("")
            continue
        input_str = reg_not.sub("", input_str)
        output_str = ""

        match0 = reg0.findall(input_str)
        if match0:
            output_str = " ".join("".join(groups) for groups in match0)
            # Trim over-long results at the first comma.
            if len(output_str) >= 40:
                output_str = reg_comma.split(output_str)[0]
            print("0: ", output_str)
        else:
            match = reg.findall(input_str)
            if match:
                output_str = join_with_unit(match)
                print(output_str)
            else:
                match1 = reg1.findall(input_str)
                if match1:
                    # The unit must be looked up in match1 here: ``match`` is
                    # empty in this branch and indexing it raises IndexError.
                    output_str = join_with_unit(match1)
                    print("1: ", output_str)
                else:
                    match2 = reg2.findall(input_str)
                    if match2:
                        output_str = " ".join("".join(groups[2:]) for groups in match2)
                        print("2: ", output_str)
        output_list.append(output_str)

    df["re"] = output_list
    df = df[["document_id", "text", "categoryAndIndex", "word", "category", "re"]]
    df.to_csv(output_path)
def re_serviceTime2(input_path="C:\\Users\\admin\\Desktop\\serviceTime_text.csv",
                    output_path="C:\\Users\\admin\\Desktop\\serviceTime_text2.csv"):
    """Extract a service-period phrase from each text using prioritized regexes.

    Reads the ``text`` column of ``input_path``, removes confusable phrases,
    then tries the patterns in a fixed priority order and keeps the first
    non-empty result.  The cleaned result is stored in a new ``re`` column and
    the selected columns are written to ``output_path``.

    :param input_path: CSV file to read; must contain the columns selected
        at the end of this function
    :param output_path: CSV file to write
    :return: None
    """
    df = pd.read_csv(input_path)
    text_list = df["text"].to_list()

    keyword = (r'('
               r'工期/交货期/服务期|项目周期|工期\(交货期\)|计划工期|工期要求:|服务期|服务时限|履行期限|服务周期|供货期|合格工期'
               r'|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期'
               r'|完成时间|服务期限|中标工期|项目周期|期限要求|周期|工期:'
               r')')
    # Confusable phrases that are deleted before matching.
    reg_not = re.compile(r'(工期延误|工期节点|工期管理|合同履行日期:见|服务期限截止|交付使用'
                         r'|服务期限:1、|工期、)|截止|合同签订日期:|保证金在合同签订'
                         r'|工期情况|签订合同前,|计划工期内|服务期内|服务期限应按')
    # Keyword + "<date> 至 <date>" range ending in 日/天/止.
    reg0 = re.compile(r'(服务期|服务期限|服务周期|服务时间)'
                      r'([^至]*)'
                      r'(至)'
                      r'([^日天止]*)'
                      r'(日|天|止)')
    # Specific keyword + number + year/month/day unit.
    reg1 = re.compile(r'(工期/交货期/服务期|服务期限|服务期|工期,|工期要求|中介服务时限)'
                      r'([^天年月日]*[\d+一二三四五六七两叁贰壹肆伍])'
                      r'(天|个月|个日历天|年|日历天|日|\(日历天\)|\(天\))')
    # Contract-signing keyword + number + unit.
    reg2 = re.compile(r'(合同签订|签订合同|合同履行日期)'
                      r'([^\d年]*)'
                      r'(\d+|一|二|三|四|五|六|七|八|九|十)'
                      r'(个|)'
                      r'(日止|日内|年|年度|月|日历天|日|周内|年内|工作日|天内)')
    # Keyword + "(日历天)"/"(天)" header + number (unit comes from the header).
    reg3 = re.compile(r'(工期,|工期|服务时间|服务期)'
                      r'(\(日历天\),|\(日历天\)|\(天\))'
                      r'([^\d+]*)'
                      r'(\d+)')
    # Keyword + "(年)" header + number.
    reg6 = re.compile(r'(服务期限)'
                      r'(\(年\))'
                      r'([^\d+]*)'
                      r'(\d+)')
    # Any keyword + number + year/month/day unit.
    reg4 = re.compile(keyword +
                      r'([^天年月日]*)'
                      r'([\d+一二三四五六七两叁贰壹肆伍])'
                      r'(,|)'
                      r'(天|个月|年|个日历天|日历天|日|\(日历天\)|\(天\))')
    # Low-priority keyword + anything up to 年/日.
    reg5 = re.compile(r'(服务要求|服务时限)'
                      r'([^年日]*)'
                      r'(年|日)')

    def extract(pattern, text, **options):
        # re_findAllResult returns (joined_text, spans); only the text is
        # wanted here.  A plain string is still accepted in case the helper
        # returns the text alone.
        result = re_findAllResult(pattern, text, **options)
        return result[0] if isinstance(result, tuple) else result

    output_list = []
    for input_str in text_list:
        input_str = re.sub(reg_not, "", input_str)

        output_str = extract(reg3, input_str, unit="天", index=2)
        if output_str == "":
            output_str = extract(reg6, input_str, unit="年", index=2)
        if output_str == "":
            output_str0 = extract(reg0, input_str, index=1)
            output_str1 = extract(reg1, input_str, index=1)
            # Both patterns are tried: take whichever is non-empty; when both
            # match, prefer the longer one unless the range match is overlong.
            if output_str0 == "":
                output_str = output_str1
            elif output_str1 == "":
                output_str = output_str0
            elif len(output_str0) >= 100:
                output_str = output_str1
            elif len(output_str0) >= len(output_str1):
                output_str = output_str0
            else:
                output_str = output_str1
        if output_str == "":
            output_str = extract(reg2, input_str, index=2)
        if output_str == "":
            output_str = extract(reg4, input_str, index=1)
        if output_str == "":
            output_str = extract(reg5, input_str, index=1)

        # Strip colons, brackets and leftover keyword fragments.
        output_str = re.sub(r":|:|限|交货期/服务期|,|\)|\(", "", output_str)
        # Cut at the first sentence-breaking punctuation mark.
        output_str = re.split("。|,|;", output_str)[0]
        output_list.append(output_str)

    df["re"] = output_list
    df = df[["document_id", "text", "categoryAndIndex", "word", "category", "re"]]
    df.to_csv(output_path)
def re_serviceTime3(input_path="C:\\Users\\admin\\Desktop\\serviceTime_text.csv",
                    output_path="C:\\Users\\admin\\Desktop\\serviceTime_text4.csv"):
    """Locate service-period phrases and record their character spans.

    For every value of the ``text`` column, confusable phrases are masked with
    ``#`` characters, the named-group pattern is applied through
    ``re_findAllResult`` (falling back to a "... 止" pattern when nothing is
    found), and the matched spans are sliced out of the unmasked text.  The
    joined slices go to the ``re`` column and the spans (as a string) to
    ``text_index``.  Results of 120 characters or more are discarded.

    :param input_path: CSV file to read; must contain the columns selected
        at the end of this function
    :param output_path: CSV file to write
    :return: None
    """
    df = pd.read_csv(input_path)
    text_list = df["text"].to_list()

    before = (r'(?P<before>'
              r'工期/交货期/服务期|工期,\(日历天\)|工期\(交货期\)|合格工期\(天\)|服务期限\(年\)|工期\(天\)'
              r'|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期'
              r'|合格工期|计划工期\(服务期\)|服务期\(日历天\)|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间'
              r'|交货时间|工期\(日历天\)'
              r'|服务期限为|计划工期|工期要求|服务期限|服务期'
              r'|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期|服务要求'
              r'|完成时间|服务期限|中标工期|项目周期|期限要求|周期|工期|供货期|合同履行日期|计划周期'
              r')')
    before2 = (r'(?P<before2>'
               r'合同签订后|合同签订之日起|约|自合同签订之日起|开工后|不超过|签订合同后|系统开发'
               r'|合同签订之日起至|自合同签订之日|合同签定后|自签订合同之日起|自合同签订起'
               r'|自合同签订生效之日起|自合同签订后不超过|中选后|均为|合同签订日至|本项目合同期|'
               r')')
    charac = (r'(?P<charac>'
              r'[::,,]*'
              r')')
    center = (r'(?P<center>'
              r'[自]?\d+年\d+月\d+日至\d+年\d+月\d+日|\d+年\d+月\d+日|[\d一二三四五六七两叁贰壹肆伍]+'
              r')')
    after = (r'(?P<after>'
             r'天|个月|年|个日历天|日历天|日|\(日历天\)|\(天\)|周内|,日历天|'
             r')')
    reg = re.compile(before + charac + before2 + center + after)
    reg1 = re.compile(before + charac + r'(.*?止)')
    reg_not = re.compile(r'(工期延误|工期节点|工期管理|交付使用'
                         r'|工期、)'
                         r'|工期情况|划工期内|服务期内')
    reg_not1 = re.compile(r'(履行日期:见|服务期限应按|签订合同前,|服务期限应按'
                          r'|务期限:1、|同签订日期:|证金在合同签|服务期限截止'
                          r')')
    reg_not2 = re.compile(r'截止|1\.|1、')

    def same_length_mask(found):
        # The spans found on the masked text are used to slice the unmasked
        # text, so a mask must never change the string length.  (A fixed
        # "####" is one character too long for the 3-character "工期、".)
        return "#" * len(found.group())

    words = []
    text_index_list = []
    for text in text_list:
        masked = text
        for confusable in (reg_not, reg_not1, reg_not2):
            masked = confusable.sub(same_length_mask, masked)

        _, text_index = re_findAllResult(reg, masked)
        if len(text_index) == 0:
            _, text_index = re_findAllResult(reg1, masked)

        word = " ".join(text[start:end] for start, end in text_index)
        # Overlong extractions are treated as misfires and dropped.
        if len(word) >= 120:
            word = ""
            text_index = []

        words.append(word)
        text_index_list.append(str(text_index))

    df["text_index"] = text_index_list
    df["re"] = words
    df = df[["document_id", "text", "categoryAndIndex", "word", "category", "re", "text_index"]]
    df.to_csv(output_path)
def re_findAllResult(reg, input, unit="", index=0):
    '''
    Run a pattern over a sentence and report the matches two ways.

    :param reg: regular expression (compiled or string)
    :param input: sentence to search
    :param unit: unit text appended once after each match's joined groups
    :param index: position of the first group included when joining
    :return: tuple ``(output, text_index)``; ``output`` is the per-match
        joined groups separated by single spaces, ``text_index`` is a list of
        ``[start, end]`` offsets, where ``start`` skips the ``before`` and
        ``charac`` named groups when the pattern defines ``before``
    '''
    pieces = []
    for groups in re.findall(reg, input):
        positions = range(index, len(groups))
        piece = "".join(groups[pos] for pos in positions)
        # The unit is only added when at least one group was joined.
        if unit != "" and len(positions) > 0:
            piece += unit
        pieces.append(piece)
    output = " ".join(pieces)

    # Offsets of every match within the whole text.
    text_index = []
    for found in re.finditer(reg, input):
        d = found.groupdict()
        print(d)
        lead = d.get("before")
        if lead is None:
            front_len = 0
        else:
            front_len = len(lead) + len(d.get("charac"))
        text_index.append([found.start() + front_len, found.end()])
    return output, text_index
def calculateLen(ss, i):
    """Return the total length of the items before and after position ``i``.

    Prints a separator, ``i``, ``ss`` and each leading item with its length
    as a debugging trace.

    :param ss: sequence of sized items (e.g. strings)
    :param i: position that splits the sequence; the item at ``i`` is not counted
    :return: tuple ``(front_len, back_len)``
    """
    print("------")
    print(i)
    print(ss)

    front_len = 0
    for pos in range(i):
        piece = ss[pos]
        print(piece, len(piece))
        front_len += len(piece)

    back_len = sum(len(ss[pos]) for pos in range(i + 1, len(ss)))
    return front_len, back_len
def test_re():
    """Scratch harness: print every match of one pattern on one sample text.

    Several candidate patterns and sample texts are defined; only ``reg4`` and
    ``test_text4`` are exercised.  Swap the arguments of ``re.finditer`` to try
    another combination.  Patterns are raw strings so that ``\\(`` and ``\\d``
    are not treated as (invalid) string escapes.

    :return: None; matched text and spans are printed
    """
    keyword = (r'('
               r'工期/交货期/服务期|项目周期|工期\(交货期\)|计划工期|工期要求:|服务期|服务时限|履行期限|服务周期|供货期|合格工期'
               r'|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期'
               r'|完成时间|服务期限|中标工期|项目周期|期限要求|周期|工期:'
               r')')
    reg0 = re.compile(r'(服务时间:|服务期限:)'
                      r'([^至到]*)'
                      r'(至|到)'
                      r'([^日时]*)'
                      r'(日|时)'
                      )
    reg = re.compile(r'(周期|工期|服务期|服务时限|履行期限|服务周期|交货期|供货期|合格工期'
                     r'|投标工期|设计工期|合格服务周期|总工期|施工工期|服务时间|流转期限|维护期限'
                     r'|完成时间|交付|服务期限|中标工期|项目周期|计划工期'
                     r')'
                     r'([^年月日\d+]*)'
                     r'([\d+|一|二|三|四|五|六|七|八|九|十])'
                     r'(日止|日内|年|年度|月|日|周内|年内|日历天|工作日|\d+日|\d+|起*)'
                     r'(个月|\(日历天\)|)')
    reg1 = re.compile(r'(工期/交货期/服务期:|服务期限|服务期|工期,|工期要求|中介服务时限)'
                      r'([^天年月日]*[\d+一二三四五六七两叁贰壹肆伍])'
                      r'(天|个月|个日历天|年|日历天|日|\(日历天\)|\(天\))')
    reg2 = re.compile(r'(服务期限|履行期限|工期|服务期|维护期限|服务周期|工期,\(日历天\),)'
                      r'(:|:)+'
                      r'(\d+|一|二|三|四|五|六|七|八|九|十|两|贰|叁)'
                      r'(日止|日内|年|年度|月|日历天|日|周内|年内|工作日)'
                      )
    s = (r'(项目周期|周期|工期/交货期/服务期|服务期|服务时限|履行期限|服务周期|交货期|供货期|合格工期'
         r'|投标工期|设计工期|合格服务周期|总工期|施工工期|服务时间|流转期限|维护期限'
         r'|完成时间|交付|服务期限|中标工期|项目周期|计划工期)')
    reg3 = re.compile(s +
                      r'([^天年月日]*)'
                      r'([\d+一二三四五六七两叁贰壹肆伍])'
                      r'(,|)'
                      r'(天|个月|年|日历天|日|\(日历天\)|\(天\))')
    reg_00 = re.compile(r'(服务期限|工期|服务时间)'
                        r'([^至]*)'
                        r'(至)'
                        r'([^止]*)'
                        r'(止)')
    reg_01 = re.compile(r'(服务期限|工期|服务时间)'
                        r'([^至]*)'
                        r'(至)'
                        r'([^日]*)'
                        r'(日)')
    reg4 = re.compile(keyword +
                      r'([^天年月日]*)'
                      r'([\d+一二三四五六七两叁贰壹肆伍])'
                      r'(,|)'
                      r'(天|个月|年|个日历天|日历天|日|\(日历天\)|\(天\))')
    reg5 = re.compile(r'(服务要求|服务时限)'
                      r'([^年日]*)'
                      r'(年|日)')

    test_text0 = "保险服务期限:自2020年1月1日零时起至2021年12月31日24时止的自然年度" \
                 " 服务时间:2020年05月25日至2020年08月08日"
    test_text = ",中标候选人公示快照。北京北方车辆集团有限公司原试验工段改扩建工程中标候选人公示,原试验工段改扩建工程,(招标项目编号:C1100000096007025006),于2020年05月21日在北京市市辖区西城区西便门内大街79号4号楼409进行了开标、评标等工作,现将本次评标结果推荐中标候选人公示如下:" \
                "标段(包)编号:C1100000096007025006001,标段(包)名称:原试验工段改扩建工程,第一名:北京永兴丰源建筑工程有限公司,投标报价(元):2,010,700.02,质量标准:合格工期(天):90,项目负责人姓名:周秋红相关证书名称:二级建造师相关证书编号:京211141545754,建筑工程施工总承包壹级,建筑装修装饰工程专业承包贰级," \
                "钢结构工程专业承包叁级,第二名:北京市九方弘业建筑工程有限责任公司,投标报价(元):1,988,322.19,质量标准:合格工期(天):90,项目负责人姓名:任敬科相关证书名称:二级建造师相关证书编号:01453994,建筑工程施工总承包叁级,钢结构工程专业承包叁级,第三名:河南德恒建设工程有限公司,投标报价(元):1,996,228.17,质量" \
                "标准:合格工期(天):90,项目负责人姓名:张献军相关证书名称:二级建造师相关证书编号:豫241141449543,建筑工程施工总承包贰级,公示期:2020年05月26日-2020年05月28日,特此公示!,对评标结果如有异议,请于2020年05月28日前在中国兵器电子招标投标交易平台上进行提出。联系人:" \
                "李茜,联系电话:13910533516,北京五环国际工程管理有限公司,2020年05月25日,"
    test_text1 = "服务时间:合同签订之日起90日历天,联系电话:13910533516,北京五环国际工程管理有限公司,2020年05月25日"
    test_text2 = "服务期限:两日 服务要求:1年 服务时限:中选后15个工作日完成"
    test_text3 = "工期/交货期/服务期:30天 标准:合格工期(天) 服务期限:两年。具体采购内容和要求详见招标文件年 项目周期:40日历天"
    test_text4 = '''
,大庆禾工煤炭分质清洁利用项目-临时用电二期工程设备、物资采购中标候选人公示,更多咨询报价请点击:,大庆禾工煤炭分质清洁利用顶目-临时用电二期工程设备、物资釆购中标候选人,(招标编号:XYwZ-20200309-5),公示结束时间:2020年04月03日,、评标情况,标段(包)[001大庆禾工煤嶽分质清洁利用项目-临时用屯二期工程设备、物资采购,中标候选人基本情况,
中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天,中标候选人第2名:
哈尔滨昊龙电气没备制造有限公司,投标报价:19.87万元,质,量:合格,工期/交货期/服务期:30天,
中标侯选人第3名:江南电气有限公司,投标报价:20.13万元,质量:合格,工期,交货期/服务期:30天:2、中标候选人按照招标文件要求承诘的项目伉责人情况,中标侯选人(哈尔滨龙网电力设备有限公司)的项目负贵人:宋环宇身份证,10398309240912;,中标候选人(哈尔滨昊龙电气设各制造有限公司)的项目负贵人:尹水生身份证,2:0227197902120112,中标候选人(江南电气有限公司)的项目负贵人:秦世亮身份证,230104197410012334;,3、中标候选人响应招标文
件要求的资格能力条件,中标候选人(哈尔滨龙网电力设备有限公司)的资格能力条件:完全响应招标公告;中标选人(哈尔滨昊龙电气没备制造有公司)的资格能力条件:完伞响应招标公,告,中标候选人(江南电气有限公司)的资格能力条件:完仝响应招标公告,、提出异议的渠道和方式,以上结果公示三日,公示期间投标人或者其他利害关系人如有异议请以书面形式向招标,人提出;如无异议,预中标人即为中标人。三、其他,项目编号:-20200309-5,项目名称:大庆禾工煤炭分质清
沽划用项目-临时电二期工程设备、物资采购,计划供货期:合同签订后30日内供货,交货地点:施工现场地面交货,质量标准:符合国家及国家电网行业合格标准,招邡方式:公开招标,开标时间:2020华3月3日9时30分,公示起止日期:2020年4月1日至2020年±月3日,经评标委员会评审,中标候选人由高到低排序前三名为:第一名:晗尔滨龙网电力设备有限公司,第二名:晗尔滨昊龙电气设备制造有限公司,第三名:江南电气有限公司,点标有,经评标委员会评审,依法确定排名第一的
中标候选人为预中标人。预中标人为:晗尔滨龙网电力设备有限公司,颀中标价:¥199,800.00元,以上结果公示三日,公小期间投标人或者其他利害关系人如有异议请以书面形式向招标入提,出;如无异议,预中标人即为中标人。监督部门及联系方式:黑龙江北星电力有跟公罰、0459-6504811,四、监督部门,本招标项目的监督部门为黑龙江北星电力有限公司。五、联系方式,招标人:黑龙江北星电力有限公司,地址:大庆市让胡路区中买大街南段28号,联系人:卜先生,电话:0459-
6604811,电子邮件:418864qgq.com,招标代理机构:黑龙江省信亿招标有限公司,地址:哈尔滨市香坊区红滨大街1号516室,联系人:张海洋,电话;0451-55151625,电子邮件:xyzb5164163.com,招标人或其招标代理机构主要负责人(项目负贲人,(签名),1,招标人或其招标代理机构:与,盖章),
'''

    # Pattern / text pair currently under test.
    for found in re.finditer(reg4, test_text4):
        print(found.group())
        print(found.span())
def _as_text(value):
    """Return ``value`` as a string, mapping missing (NaN/None) cells to ""."""
    return "" if pd.isna(value) else str(value)


def _labels_agree(word, predicted):
    """Tell whether an annotated phrase and a regex result count as a hit.

    Equal values (including two empty cells) agree.  Otherwise one must be
    contained in the other, and an empty side never agrees with a non-empty one.
    """
    word = _as_text(word)
    predicted = _as_text(predicted)
    if word == predicted:
        return True
    if not word or not predicted:
        # "" is a substring of everything; do not let it count as a hit.
        return False
    return predicted in word or word in predicted


def re_Accuracy(filename, base_dir="C:\\Users\\admin\\Desktop\\"):
    """Score the ``re`` column of a result CSV against the ``word`` column.

    A row is correct when the two cells are equal or one contains the other.
    The 0/1 flags are stored in a ``correct`` column and the file is rewritten
    in place.  Missing or non-string cells are compared as text instead of
    raising, and an empty file yields an accuracy of 0.0.

    :param filename: CSV file name without the ``.csv`` extension
    :param base_dir: directory prefix (including trailing separator) that is
        concatenated in front of ``filename``
    :return: fraction of correct rows
    """
    path = base_dir + filename + ".csv"
    df = pd.read_csv(path)

    flags = [int(_labels_agree(word, predicted))
             for word, predicted in zip(df["word"], df["re"])]
    accuracy = sum(flags) / len(flags) if flags else 0.0
    print("Accuracy: ", accuracy)

    df["correct"] = flags
    df.to_csv(path)
    return accuracy
def getTestData():
    """Split the washed sentence file into a test set and the remainder.

    For each label 0-4 the first N rows (500, 1500, 600, 600, 500) go to the
    test file and the rest to the "Notest" file.  Both outputs keep only the
    ``Word``, ``Label``, ``Sentence`` and ``BIO`` columns.

    :return: None; two CSV files are written
    """
    source = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_all_re_washed2.csv")

    quotas = [(0, 500), (1, 1500), (2, 600), (3, 600), (4, 500)]
    kept_parts = []
    dropped_parts = []
    for label, quota in quotas:
        rows = source[source["Label"] == label]
        kept_parts.append(rows[:quota])
        dropped_parts.append(rows[quota:])

    raw_columns = ["index", "Word", "Label", "Sentence", "BIO"]
    wanted = ["Word", "Label", "Sentence", "BIO"]

    df_test = pd.concat(kept_parts)
    df_test.columns = raw_columns
    df_test = df_test.reset_index()[wanted]

    df_deleted = pd.concat(dropped_parts)
    df_deleted.columns = raw_columns
    df_deleted = df_deleted.reset_index()[wanted]

    df_test.to_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
    df_deleted.to_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
def washData(path="C:\\Users\\admin\\Desktop\\Person_Sentence_all_re_washed2.csv"):
    """Relabel rows of the sentence file with one regex rule and save in place.

    A row currently labelled 2 is changed to label 1 when its ``Sentence``
    contains "联系人 :" both before and after the "||" marker and the matched
    span contains none of the organisation/context words in ``reg2``.
    The file is rewritten with the ``Word``, ``Label``, ``Sentence`` and
    ``BIO`` columns.

    Labels are written with ``DataFrame.loc`` because assignment through
    ``df["Label"].iloc[...]`` is chained indexing and may not reach the frame.

    :param path: CSV file that is read and then overwritten
    :return: None
    """
    df = pd.read_csv(path)

    reg1 = re.compile(r'(联系人 :).*(\|\|).*(联系人 :)')
    reg2 = re.compile(r'代理|公司|地址|采购|电话|商务|招标|技术|项目|联系 方式|监督')

    def should_relabel(sentence):
        # Missing sentences are read as NaN (a float) and cannot be searched.
        if not isinstance(sentence, str):
            return False
        found = reg1.search(sentence)
        return found is not None and reg2.search(found.group()) is None

    selected = (df["Label"] == 2) & df["Sentence"].map(should_relabel).astype(bool)
    df.loc[selected, "Label"] = 1

    df = df[["Word", "Label", "Sentence", "BIO"]]
    df.to_csv(path)
# Phrases that, when found just before the "||" marker, mark a generic contact.
_CONTACT_LEADS = ("。 联系人", ", 联系人", ", 联系 方式", "。 联系 方式")


def _has_contact_lead(sentence):
    """Tell whether the 30 characters before the first "||" contain a contact lead."""
    if not isinstance(sentence, str):
        # Missing sentences are read as NaN and have no context to inspect.
        return False
    forward = sentence.split("||")[0][-30:]
    return any(lead in forward for lead in _CONTACT_LEADS)


def relabel(input_path="C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv",
            output_path="C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv"):
    """Permute the labels 1/2/3 and move bare-contact rows to label 3.

    Step 1 rotates the labels: 1 -> 3, 2 -> 1, 3 -> 2.  The rotation used to be
    written with ``==`` (a comparison whose result was discarded), so it never
    took effect; it is applied here in one pass so that a row is never
    remapped twice.
    NOTE(review): the rotation is inferred from the three no-op statements —
    confirm it is still wanted before regenerating data.

    Step 2 sets label 3 on every row labelled 1 or 2 (after step 1) whose
    sentence has a contact-person lead right before the "||" marker.

    :param input_path: CSV file to read
    :param output_path: CSV file to write (``Word``, ``Label``, ``Sentence``, ``BIO``)
    :return: None
    """
    df = pd.read_csv(input_path)

    rotation = {1: 3, 2: 1, 3: 2}
    df["Label"] = df["Label"].map(lambda label: rotation.get(label, label))

    contact_rows = df["Label"].isin([1, 2]) & df["Sentence"].map(_has_contact_lead).astype(bool)
    df.loc[contact_rows, "Label"] = 3

    df = df[["Word", "Label", "Sentence", "BIO"]]
    df.to_csv(output_path)
def relabel2(input_path="C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv",
             output_path=None):
    """Move label-3 rows whose context mentions "采购 " to label 1.

    For every row labelled 3, the 20 characters before the first "||" are
    inspected.  The row becomes label 1 when that context contains "采购 " and
    none of the excluded words; each such context is printed.

    Labels are written with ``DataFrame.loc`` instead of chained indexing.
    By default nothing is saved (dry run).

    :param input_path: CSV file to read
    :param output_path: optional CSV file to write the result to
    :return: the relabelled frame with the ``Word``, ``Label``, ``Sentence``
        and ``BIO`` columns
    """
    df = pd.read_csv(input_path)
    excluded = ("窗口", "公司", "文件", "质疑", "中心", "处")

    to_relabel = []
    for index, row in df.iterrows():
        if row["Label"] != 3:
            continue
        sentence = row["Sentence"]
        if not isinstance(sentence, str):
            # Missing sentences are read as NaN and have no context to inspect.
            continue
        forward = sentence.split("||")[0][-20:]
        if "采购 " in forward and not any(word in forward for word in excluded):
            print(forward)
            to_relabel.append(index)

    df.loc[to_relabel, "Label"] = 1
    df = df[["Word", "Label", "Sentence", "BIO"]]
    if output_path is not None:
        df.to_csv(output_path)
    return df
if __name__ == "__main__":
    # Manual driver: each line below is one step that can be run on its own.
    # Only the uncommented call executes; uncomment another to run it.
    # Postgre2Data()
    # data2BIOData()
    # BIOData2DataFrame()
    # PersonBIOData2BIO_Sentence()   # was timed with time.time() before/after
    # BIOData2PersonData()           # was timed with time.time() before/after
    # duplicateData(3, 0.5)
    # resetAndShuffleData()
    # BIOData2TXT()                  # was timed with time.time() before/after
    # TXT2BIOData()
    # BIOData2Bidway()
    # BIOData2ServiceTime()
    # Text2Csv()
    # Csv2ServiceTimeText()
    # Csv2BidwayText()
    # re_serviceTime()
    # re_bidway()
    # Postgre2Data()
    # getTestData()
    # washData()
    # re_serviceTime2()
    # re_Accuracy("serviceTime_text1")
    # test_re()
    # re_serviceTime3()
    # relabel()
    relabel2()
|