
    AxeChen's Blog: Entity Disambiguation (Linking to an Entity Knowledge Base)

    Posted: 2021-09-11 16:56

    disambiguation.py

    #!/usr/bin/python3
    import pymysql
    import json
    import requests
    from similarity import similarity   # defined in similarity.py below
    
    
    def findCandidates(entity):
        # Open the database connection (host, username, password and dbname
        # are configuration values not shown in this post)
        db = pymysql.connect(host=host, port=3306,
                             user=username, passwd=password, db=dbname)
        # Create a cursor object with the cursor() method
        cursor = db.cursor()
        # Parameterized SQL query: match person names ending with the entity string
        sql = "SELECT * FROM t_wikidata_human_content WHERE `name` LIKE %s"
        cursor.execute(sql, ('%' + entity,))
        persons = cursor.fetchall()
        db.close()
        candidates = []
        for person in persons:
            candidates.append({'wiki_id': person[1], 'name': person[2], 'description': person[3]})
        return candidates
    
    
    def getEntity(query):
        # Call the NER web service ('ip' is a placeholder for the real host)
        url = 'http://ip:8018/getNer'
        properties = {'text': query, 'lang': 'en'}
        resp = requests.get(url, params=properties).json()
        entities = []
        # Keep only the entities tagged as PERSON
        for entity in resp:
            if entity['ner'] == "PERSON":
                entities.append(entity)
        return entities
    
    
    def match(query):
        entities = getEntity(query)
        points = []
        for entity in entities:     # disambiguate each recognized entity on its own
            candidates = findCandidates(entity['word'])
            texts = []
            for candidate in candidates:
                texts.append(candidate['description'])
            try:
                sims = similarity(texts, query)
                # sort by similarity score; sort[-1][0] is the original index of the maximum
                sort = sorted(enumerate(sims), key=lambda x: x[1])
                max_index = sort[-1][0]
            except Exception:
                max_index = 0
            try:
                points.append({'wiki_id': candidates[max_index]['wiki_id'], 'name': candidates[max_index]['name'],
                               'begin': entity['begin'], 'end': entity['end']})
            except IndexError:
                # no candidate was found in the database for this entity
                points.append({'wiki_id': -1,
                               'begin': entity['begin'], 'end': entity['end']})
        points = json.dumps(points, indent=4)
        return points
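
    A minimal usage sketch (not part of the original post): it assumes the MySQL connection settings, the NER service address and the stop-word file are already configured; the sample sentence and the printed values are purely illustrative.

    if __name__ == '__main__':
        result = match("Barack Obama met Angela Merkel in Berlin.")
        print(result)
        # Expected shape of the output (values are illustrative):
        # [
        #     {
        #         "wiki_id": "Q76",
        #         "name": "Barack Obama",
        #         "begin": 0,
        #         "end": 12
        #     },
        #     ...
        # ]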
    

    similarity.py

    # -*- coding:utf-8 -*-
    
    import codecs
    import re
    from gensim import corpora, models, similarities
    from nltk.tokenize import WordPunctTokenizer
    
    
    def wordtokenizer(sentence):
        words = WordPunctTokenizer().tokenize(sentence)
        return words
    
    
    def tokenization(text, stopwordpath):
        # Load the stop-word list (one word per line)
        with codecs.open(stopwordpath, 'r', encoding='utf8') as f:
            stopwords = [w.strip() for w in f.readlines()]
        result = []
        # Strip punctuation and lower-case the text before tokenizing
        text = re.sub("[-',{:+}|.()/?!·;]", ' ', text).lower()
        words = wordtokenizer(text)
        for word in words:
            if word not in stopwords:
                result.append(word)
        return result
    
    
    def similarity(texts, query, stopwordpath='stop.txt'):
        corpus = []
        for text in texts:
            corpus.append(tokenization(text, stopwordpath))
        # Build the feature dictionary: every distinct word in the corpus gets a unique integer id
        dictionary = corpora.Dictionary(corpus)
        # doc2bow() counts how often each distinct word occurs, maps the word to its id,
        # and returns each document as a sparse bag-of-words vector
        doc_bow = [dictionary.doc2bow(text) for text in corpus]

        tfidf = models.TfidfModel(doc_bow)      # collect the IDF statistics for every feature
        tfidf_bow = tfidf[doc_bow]              # TF-IDF weighted corpus
        query = tokenization(query, stopwordpath)
        query_bow = dictionary.doc2bow(query)
        # Cosine-similarity index over the TF-IDF vectors; the query is weighted
        # with the same TF-IDF model before comparison
        index = similarities.MatrixSimilarity(tfidf_bow, num_features=len(dictionary))
        sims = index[tfidf[query_bow]]
        return sims
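
    A small standalone sketch (not part of the original post) showing how similarity() ranks candidate descriptions against a query; it assumes gensim and nltk are installed and that a stop.txt stop-word file exists in the working directory, and the example descriptions are made up.

    if __name__ == '__main__':
        descriptions = [
            "American politician, 44th president of the United States",
            "American actor and film producer",
        ]
        query = "Which president signed the Affordable Care Act?"
        sims = similarity(descriptions, query)
        print(sims)     # one cosine-similarity score per description
        # Pick the best candidate the same way match() does
        best = sorted(enumerate(sims), key=lambda x: x[1])[-1][0]
        print(descriptions[best])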
    