AxeChen的博客: Entity Disambiguation (Linking to an Entity Knowledge Base)
disambiguation.py
#!/usr/bin/python3
import pymysql
import json
import requests
from SimilarityEN import similarity
def findCandidates(entity):
    # Open the database connection; host, username, password and dbname
    # are expected to be defined in the surrounding configuration
    db = pymysql.connect(host=host, port=3306,
                         user=username, passwd=password, db=dbname)
    # Create a cursor object with the cursor() method
    cursor = db.cursor()
    # Parameterized SQL query (avoids string concatenation and injection);
    # matches records whose name ends with the entity mention
    sql = "SELECT * FROM t_wikidata_human_content WHERE `name` LIKE %s"
    cursor.execute(sql, ('%' + entity,))
    persons = cursor.fetchall()
    db.close()
    candidates = []
    for person in persons:
        candidates.append({'wiki_id': person[1], 'name': person[2], 'description': person[3]})
    return candidates
def getEntity(query):
    # Call the NER service and keep only the PERSON entities
    url = 'http://ip:8018/getNer'
    properties = {'text': query, 'lang': 'en'}
    resp = requests.get(url, params=properties).json()
    entities = []
    for entity in resp:
        if entity['ner'] == "PERSON":
            entities.append(entity)
    return entities
def match(query):
    entities = getEntity(query)
    points = []
    for entity in entities:  # disambiguate each recognized entity independently
        candidates = findCandidates(entity['word'])
        texts = []
        for candidate in candidates:
            texts.append(candidate['description'])
        try:
            scores = similarity(texts, query)
            sort = sorted(enumerate(scores), key=lambda x: x[1])  # sort[-1][0] is the original index of the highest score
            max_index = sort[-1][0]
        except Exception:
            max_index = 0
        try:
            points.append({'wiki_id': candidates[max_index]['wiki_id'], 'name': candidates[max_index]['name'],
                           'begin': entity['begin'], 'end': entity['end']})
        except IndexError:
            # no candidate was found in the entity library
            points.append({'wiki_id': -1,
                           'begin': entity['begin'], 'end': entity['end']})
    points = json.dumps(points, indent=4)
    return points
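
A minimal usage sketch of match() under the setup above: the database settings (host, username, password, dbname), the NER service at http://ip:8018/getNer, and the t_wikidata_human_content table are assumed to be configured already, and the query string here is invented purely for illustration.

# Illustrative only: the query text is made up, and the NER service and
# database referenced in disambiguation.py must already be reachable.
if __name__ == '__main__':
    query = "Barack Obama served as the 44th president of the United States."
    linked = match(query)
    # match() returns a JSON string: a list of objects such as
    # {"wiki_id": ..., "name": ..., "begin": ..., "end": ...},
    # or {"wiki_id": -1, ...} when no candidate matched.
    print(linked)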
similarity.py
# -*- coding:utf-8 -*-
import codecs
import re
from gensim import corpora, models, similarities
from nltk.tokenize import WordPunctTokenizer
def wordtokenizer(sentence):
    words = WordPunctTokenizer().tokenize(sentence)
    return words
def tokenization(text, stopwordpath):
    # Load the stopword list (one word per line)
    with codecs.open(stopwordpath, 'r', encoding='utf8') as f:
        stopwords = [w.strip() for w in f.readlines()]
    result = []
    # Strip punctuation, lowercase, tokenize, and drop stopwords
    text = re.sub("[-',{:+}|.()/?!·;]", ' ', text).lower()
    words = wordtokenizer(text)
    for word in words:
        if word not in stopwords:
            result.append(word)
    return result
def similarity(texts, query, stopwordpath='stop.txt'):
    corpus = []
    for text in texts:
        corpus.append(tokenization(text, stopwordpath))
    # Build the dictionary: assigns a unique integer id to every word in the corpus
    dictionary = corpora.Dictionary(corpus)
    # doc2bow() counts the occurrences of each distinct word, maps each word to its
    # integer id, and returns the result as a sparse bag-of-words vector
    doc_bow = [dictionary.doc2bow(text) for text in corpus]
    # Fit a TF-IDF model (computes the IDF statistics for every feature)
    tfidf = models.TfidfModel(doc_bow)
    tfidf_bow = tfidf[doc_bow]  # TF-IDF weighted corpus
    # Convert the query to the same bag-of-words space and score it against every text
    query = tokenization(query, stopwordpath)
    query_bow = dictionary.doc2bow(query)
    index = similarities.MatrixSimilarity(tfidf_bow)
    sims = index[tfidf[query_bow]]  # weight the query with the same TF-IDF model before scoring
    return sims
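
A small standalone example of calling similarity(); the candidate descriptions and the query are made up, and a stopword file stop.txt (one word per line) is assumed to exist, since that is the default stopwordpath.

# Illustrative only: texts and query are invented, and stop.txt must exist.
texts = [
    "American politician who served as the 44th president of the United States",
    "American professional basketball player",
]
query = "Which president signed the Affordable Care Act into law?"
sims = similarity(texts, query)
print(list(sims))  # one cosine similarity score per candidate description, in input order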