Python 最大概率法進行漢語切分的方法

更新時間：2018年12月14日 16:04:30 作者：qijingpei

今天小編就為大家分享一篇Python 最大概率法進行漢語切分的方法，具有很好的參考價值，希望對大家有所幫助。一起跟隨小編過來看看吧

要求：

1 采用基于語言模型的最大概率法進行漢語切分。

2 切分算法中的語言模型可以采用n-gram語言模型，要求n >1，并至少采用一種平滑方法；

代碼：

廢話不說，代碼是最好的語言

import re
import math

MAX_SPLITLEN = 4  # maximum length, in characters, of a candidate word
corpus_lib = ''  # corpus text; assigned by init_corpus_lib()


def init_corpus_lib(path):  # initialise the corpus
    """Load the corpus file into the module-level ``corpus_lib`` string.

    The file is decoded as UTF-8; undecodable bytes are silently dropped
    (``errors='ignore'``).

    Args:
        path: Location of the corpus text file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    global corpus_lib
    with open(path, 'r', encoding='utf-8', errors='ignore') as file:
        # Store the text itself. Wrapping the list of lines in str() would
        # store its repr instead: brackets, quotes and backslash-n escape
        # sequences would become part of the corpus and line breaks would no
        # longer count as whitespace between words.
        corpus_lib = file.read()


def get_candidate_words(sen):
 global MAX_SPLITLEN
 global corpus_lib
 candidate_words = []
 for sp in range(len(sen)):
  w = sen[sp]
  candidate_words.append([w, sp, sp]) # 有些字可能不在語料庫中，把它作為單個(gè)字加進(jìn)去
  for mp in range(1, MAX_SPLITLEN): # 判斷1 ~ MAX_SPLITLEN-1這3種詞中是否有候選詞.
   if sp + mp < len(sen):
    w += sen[sp + mp]
    if w in corpus_lib:
     candidate_words.append([w, sp, sp + mp]) # 存儲(chǔ)詞，初始位置，結(jié)束位置
 print('候選詞有：%s' % candidate_words)
 return candidate_words


def segment_sentence(sen):  # sen: the sentence to segment
    """Enumerate the ways ``sen`` can be cut into candidate words.

    Candidates come from ``get_candidate_words(sen)`` as ``[text, start, end]``
    entries with inclusive indices. A segmentation is a chain of candidates
    in which each word starts right after the previous one ends, beginning at
    index 0 and finishing on the last character.

    The chains are built by a depth-first walk over a separate work stack.
    Nothing is inserted into or removed from a list that is being iterated,
    so no prefix is skipped and every segmentation is found (until the
    expansion budget below runs out).

    Args:
        sen: The sentence to segment.

    Returns:
        A list of segmentations, each a string of words joined by single
        spaces. Empty when ``sen`` is empty. The list is also printed.
    """
    candidate_words = get_candidate_words(sen)
    sen_len = len(sen)

    # Group candidates by start index so the words that can follow a given
    # position are found directly.
    followers = {}
    for text, start, end in candidate_words:
        followers.setdefault(start, []).append((text, end))

    # To keep long sentences from taking too long, stop extending partial
    # segmentations after this many extensions (trading some accuracy for
    # speed). Depth-first order means complete results are produced early.
    max_expansions = 1000
    expansions = 0

    word_segment_res_list = []  # complete segmentations
    pending = [(0, ())] if sen_len else []  # (next start index, words so far)
    while pending:
        start, words = pending.pop()
        if start == sen_len:
            word_segment_res_list.append(' '.join(words))
            continue
        if expansions >= max_expansions:
            continue  # budget spent: drop unfinished prefixes
        expansions += 1
        # Push in reverse so candidates are explored in their original order.
        for text, end in reversed(followers.get(start, ())):
            pending.append((end + 1, words + (text,)))

    print('獲得的所有分詞結(jié)果是：')
    print(word_segment_res_list)
    return word_segment_res_list


# P(w1,w2,...,wn) = P(w1/start)P(w2/w1)P(w3/w2).....P(Wn/Wn-1)
# With zero-based indices: = P(w0/start)P(w1/w0)...P(Wn-1/Wn-2)
def calculate_word_sequence_probability(sequence):
    """Score a space-separated word sequence against ``corpus_lib``.

    The score is a sum of natural logarithms: one term for the first word
    and one term per adjacent word pair. Each term is
    ``log((occurrences + 1) / len(corpus_lib))``; the ``+ 1`` is add-one
    smoothing. The sequence and its score are also printed.

    A word counts as an occurrence when it stands as a whole
    whitespace-delimited token in the corpus; a pair counts when the two
    tokens are separated by whitespace only. Words are escaped before being
    used in a pattern, and the delimiters are matched with look-arounds so
    that back-to-back occurrences and tokens at the very start or end of the
    corpus are all counted.

    Args:
        sequence: Words joined by single spaces.

    Returns:
        The accumulated log score as a float.

    Raises:
        ZeroDivisionError: If ``corpus_lib`` is empty.
    """
    global corpus_lib
    word_list = sequence.split(' ')
    # NOTE(review): this is the corpus length in characters, and the pair
    # terms below divide by it as well, so they are not conditional
    # probabilities Count(w1 w2) / Count(w1). Kept as-is so that scores stay
    # comparable with earlier runs -- confirm whether this is intended.
    total_word_num = len(corpus_lib)

    def count_matches(pattern):
        return sum(1 for _ in re.finditer(pattern, corpus_lib))

    # First word: P(w1/start) = Count(w1) / total, with add-one smoothing.
    first_pattern = r'(?<!\S)' + re.escape(word_list[0]) + r'(?!\S)'
    prob_total = 0.0
    prob_total += math.log((count_matches(first_pattern) + 1) / total_word_num)

    # Remaining terms: P(w2/w1)P(w3/w2).....P(Wn/Wn-1)
    for prev_w, later_w in zip(word_list, word_list[1:]):
        # Only the first word is consumed; the second is checked with a
        # look-ahead so that overlapping pairs ("a a a") are all counted.
        pair_pattern = (r'(?<!\S)' + re.escape(prev_w)
                        + r'(?=\s+' + re.escape(later_w) + r'(?!\S))')
        count = count_matches(pair_pattern) + 1  # add-one smoothing
        prob_total += math.log(count / total_word_num)

    print('%s的概率是：' % sequence)
    print(prob_total)
    return prob_total


def calculate_biggest_prob(word_segm_res):
    """Pick the highest-scoring segmentation.

    To save time, only "compact" segmentations are scored first: those whose
    word count is at most 0.6 times the number of characters in the sentence.
    Only when no segmentation is compact are all of them scored. Scores come
    from ``calculate_word_sequence_probability``; on ties the earliest
    candidate wins. The winner and ``e ** score`` are printed.

    Args:
        word_segm_res: Segmentations, each a string of space-separated words.

    Returns:
        The best segmentation, or ``''`` when ``word_segm_res`` is empty.
    """
    def pick_best(candidates):
        # None marks "nothing scored yet", so any real score can win.
        best, best_prob = '', None
        for w_s in candidates:
            prob = calculate_word_sequence_probability(w_s)
            if best_prob is None or best_prob < prob:
                best, best_prob = w_s, prob
        return best, best_prob

    all_candidates = list(word_segm_res)
    compact = []
    for w_s in all_candidates:
        zi_shu = len(''.join(w_s.split()))  # characters, whitespace removed
        if len(w_s.split(' ')) <= zi_shu * 0.6:
            compact.append(w_s)

    best_w_s, max_prob = pick_best(compact)
    if max_prob is None:
        # No compact segmentation exists: fall back to scoring all of them.
        best_w_s, max_prob = pick_best(all_candidates)
    if max_prob is None:
        max_prob = 0.0  # nothing to score at all; reported as probability 1.0

    print('最好的分詞結(jié)果（概率為%s）是 ：%s' % (math.exp(max_prob), best_w_s))
    return best_w_s


def split_middle(sen_to_segment):  # cut once near the middle; return the cut position
    """Find a word boundary near the middle of a sentence.

    A five-character window starting two characters before the midpoint is
    segmented, and the first space in the best segmentation of that window
    marks the boundary.

    Args:
        sen_to_segment: The sentence to split.

    Returns:
        The index in ``sen_to_segment`` of the last character of the first
        word in the window, i.e. the inclusive end of the left part.

    Raises:
        ValueError: If the best segmentation of the window has no space.
    """
    window_start = len(sen_to_segment) // 2 - 2
    window = sen_to_segment[window_start:window_start + 5]
    candidates = segment_sentence(window)
    best = calculate_biggest_prob(candidates)
    first_gap = best.index(' ')
    return window_start + first_gap - 1


def _segment_run(text):
    """Segment one run of letters, halving it first while it is too long."""
    if len(text) >= 20:
        middle = split_middle(text)
        # Only split when both halves are non-empty; otherwise the run would
        # never get shorter.
        if 0 <= middle < len(text) - 1:
            left_half = text[:middle + 1]
            right_half = text[middle + 1:]
            return _segment_run(left_half) + ' ' + _segment_run(right_half)
    return calculate_biggest_prob(segment_sentence(text))


def split_mark_and_too_long_sent(sentences,
                                 out_path='D:/1佩王的文件/計(jì)算語言學(xué)基礎(chǔ)/生成結(jié)果.txt',
                                 encoding='utf-8'):
    """Segment a text line by line and write the result to a file.

    Each line is cut into runs of alphabetic characters (``str.isalpha``).
    Every other character (punctuation, digits, spaces) is copied to the
    output as its own space-delimited token. Each run is segmented with
    ``calculate_biggest_prob(segment_sentence(run))``. A run of 20 or more
    characters is first split at the position given by ``split_middle``, and
    the halves are split again while they are still that long, so no
    over-long run reaches ``segment_sentence``.

    The input lines, each non-alphabetic character and the final text are
    printed as progress output.

    Args:
        sentences: The text to segment; may contain several lines.
        out_path: File that receives the segmented text.
        encoding: Encoding of the output file. It is explicit so the result
            does not depend on the platform's locale.

    Returns:
        The segmented text that was written to ``out_path``.

    Raises:
        OSError: If ``out_path`` cannot be written.
    """
    sen_list = sentences.splitlines()
    print(sen_list)

    pieces = []
    for line in sen_list:
        sen_to_segment = ''
        for single_char in line:
            if single_char.isalpha():
                sen_to_segment += single_char
                continue
            # single_char is punctuation, a digit, etc.: flush the pending
            # run, then emit the character itself with a space on each side.
            if sen_to_segment:
                pieces.append(_segment_run(sen_to_segment) + ' ')
                sen_to_segment = ''
            print(single_char)
            pieces.append(single_char + ' ')

        # Text left over at the end of the line.
        if sen_to_segment:
            pieces.append(_segment_run(sen_to_segment) + ' ')
        pieces.append('\n')

    out_text = ''.join(pieces)
    with open(out_path, 'w', encoding=encoding) as file:
        file.write(out_text)
    print(out_text)
    return out_text


if __name__ == '__main__':
    # Load the training corpus.
    init_corpus_lib('D:/1佩王的文件/計(jì)算語言學(xué)基礎(chǔ)/北大(人民日?qǐng)?bào))語料庫199801.txt')

    # Read the article that is to be segmented.
    testset_path = 'E:/study/1.研一的課/計(jì)算語言學(xué)基礎(chǔ)課件/testset.txt'
    with open(testset_path, 'r', encoding='gbk', errors='ignore') as testset:
        sentences = ''.join(testset.readlines())

    # Split the text at punctuation into short sentences first, then segment
    # each short sentence and score the candidates.
    split_mark_and_too_long_sent(sentences)

實現思路

1、處理語料庫

用的是人民日報語料庫，然后為了方便把屬性去掉了，只留下了詞。

2、讀要分詞的文本，按照標點符號、數字進行分割

按標點符號、數字進行分割，確保分割結果是只有漢字的句子。如果句子過長(>=20)，則先對句子中間位置的5個字先切分一次，從5個字的切分結果的第一個空格處，把句子分成兩部分，再對每一部分分別切詞。標點符號、數字則按照原樣輸出。

3、找出所有候選詞

從一個句子中找出所有的候選詞。如每次取4個字，假設為abcd這四個字，得到：a\b\c\d\ab\bc\cd\abc\bcd\abcd，判斷它們每個是否在語料庫中，如果是的話則存為候選詞。并存儲下這個詞在句子中的開始位置和結束位置。

4、計算出一個句子所有的切分結果

所有的候選詞放到了一個python的list（即集合）中，遍歷所有開始位置為0但結束位置不是句尾的候選詞，按照詞的開始位置和結束位置進行拼湊，新拼湊出的元素會加入到這個list中。當一個詞和其他所有能拼湊的詞拼湊完后，從list中刪除這個詞。當遍歷結束后，集合中會有長度等于句子長度的元素，這些元素就是一個句子所有的切分結果。

4、使用2-gram模型計算出每種切分結果的概率，挑選出最大概率的句子切分結果

計算概率時使用條件概率，使用加一平滑。條件概率的公式為：$P(w_1, w_2, \dots, w_n) = P(w_1 \mid \text{start})\,P(w_2 \mid w_1)\,P(w_3 \mid w_2) \cdots P(w_n \mid w_{n-1})$，利用log把乘法變成加法：$\log P(w_1, w_2, \dots, w_n) = \log P(w_1 \mid \text{start}) + \log P(w_2 \mid w_1) + \cdots + \log P(w_n \mid w_{n-1})$

句子往往不是由很多個單字組成的，所以為了提高速度，我們先計算出切分后詞個數 <= 0.6 * 句子字數的切分結果的概率，如果不為0則返回這個最大概率，如果為0的話，再計算詞個數 > 0.6 * 句子字數的切分結果中的最大概率。

5、將擁有最大概率的句子切分結果存到文件中

以上這篇Python 最大概率法進行漢語切分的方法就是小編分享給大家的全部內容了，希望能給大家一個參考，也希望大家多多支持腳本之家。

您可能感興趣的文章: