当前位置 博文首页 > python文本处理的方案(结巴分词并去除符号)

    python文本处理的方案(结巴分词并去除符号)

    作者:依我去 时间:2021-08-13 18:48

    看代码吧~

    import re
    import jieba.analyse
    import codecs
    import pandas as pd
    def simplification_text(xianbingshi, out_path=r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\code\xianbingshi_write.txt'):
        """Extract every fragment delimited by the literal markers <b> and <e>.

        Each line of the UTF-8 input file is scanned for text enclosed between
        ``<b>`` and ``<e>``; every match is written to *out_path*, one fragment
        per line.

        :param xianbingshi: path of the UTF-8 input file to scan.
        :param out_path: path of the UTF-8 output file. Defaults to the
            original hard-coded location so existing callers are unaffected.
        """
        # Compile once instead of re-parsing the pattern on every line; the
        # raw string avoids invalid-escape warnings on modern Python.
        pattern = re.compile(r'(?<=\<b\>).*?(?=\<e\>)')
        fragments = []
        with codecs.open(xianbingshi, 'r', 'utf8') as f:
            for line in f:
                fragments.extend(pattern.findall(line.strip()))
        with codecs.open(out_path, 'w', 'utf8') as f:
            for fragment in fragments:
                f.write(fragment + '\n')
    def jieba_text():
        """Tokenise the extracted corpus with jieba, de-duplicate the tokens
        (keeping first occurrence) and write one token per line to ``word.txt``
        in the current directory.
        """
        # Use a with-block so the handle is closed — the original leaked the
        # file object returned by open().
        with open(r"C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\xianbingshi_write.txt", encoding='utf-8') as f:
            data = f.read()
        seg_list = jieba.cut(data, cut_all=False)  # accurate (non-full) mode
        stripped = [token.strip() for token in seg_list]
        # dict.fromkeys preserves insertion order, so this matches the
        # original pandas drop_duplicates(keep='first') round-trip without
        # building a DataFrame.
        word_list = list(dict.fromkeys(stripped))
        with codecs.open('word.txt', 'w', 'utf8') as w:
            for line in word_list:
                w.write(line + '\n')
    def word_messy(word):
        """Clean the token file: blank out purely numeric / alphanumeric tokens.

        Reads *word* (UTF-8, one token per line), removes any leading match of
        the numeric/alphanumeric patterns below, sorts the remaining lines and
        rewrites them to ``word.txt`` in the current directory.

        :param word: path of the UTF-8 token file to clean.
        """
        # Raw string avoids invalid-escape SyntaxWarnings on Python 3.12+;
        # compiled once instead of on every line. Pattern text is unchanged.
        pattern = re.compile(
            r"^[1-9]\d*\.\d*|^[A-Za-z0-9]+$|^[0-9]*$|^(-?\d+)(\.\d+)?$|^[A-Za-z0-9]{4,40}.*?"
        )
        word_sub_list = []
        with codecs.open(word, 'r', 'utf8') as f:
            for line in f:
                word_sub_list.append(pattern.sub('', line))
        word_sub_list.sort()
        with codecs.open('word.txt', 'w', 'utf8') as w:
            for line in word_sub_list:
                w.write(line.strip("\n") + '\n')
    if __name__ == '__main__':
        # Script entry point: extract the <b>...<e> fragments from the raw
        # corpus file (presumably 现病史 / present-illness records — confirm
        # against the data) into the hard-coded output path.
        xianbingshi = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\xianbingshi_sub_sen_all(1).txt'
        # simplification_text(xianbingshi)
        # word = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\word.txt'
        simplification_text(xianbingshi)

    补充:python 进行结巴分词 并且用re去掉符号

    看代码吧~

    # Build the stop-word lookup table.
    # BUGFIX: the original passed errors='ingnore' (typo) to open(); an
    # unknown error handler raises LookupError the first time a decode
    # error occurs, so it is corrected to 'ignore' here.
    stopwords = {}
    with open('stop_words.txt', 'r', encoding='utf-8', errors='ignore') as fstop:
        for eachWord in fstop:
            stopwords[eachWord.strip()] = eachWord.strip()  # stop-word dictionary
    # Tokenise every line of all.txt, strip punctuation and stop words, and
    # write the space-separated tokens to allutf11.txt. with-blocks replace
    # the manual close() calls so the handles are released even on error.
    with open('all.txt', 'r', encoding='utf-8', errors='ignore') as f1, \
         open('allutf11.txt', 'w', encoding='utf-8') as f2:
        for line in f1:
            line = line.strip()  # trim surrounding whitespace
            # Remove ASCII and full-width punctuation before segmentation.
            # Backslashes are doubled so the pattern reaching re is identical
            # to the original while avoiding invalid-escape warnings.
            line = re.sub("[0-9\\s+\\.\\!\\/_,$%^*()?;;:-【】+\"']+|[+——!,;:。?、~@#¥%……&*()]+", " ", line)
            seg_list = jieba.cut(line, cut_all=False)  # jieba, accurate mode
            # Join kept tokens, each followed by a single space (matches the
            # original += concatenation byte-for-byte).
            out_str = "".join(word + " " for word in seg_list if word not in stopwords)
            f2.write(out_str)

    以上为个人经验,希望能给大家一个参考,也希望大家多多支持站长博客。

    jsjbwy
    下一篇:没有了