使用Python編寫詞頻統計工具的示例代碼

更新時間：2025年06月18日 09:36:08 作者：晨曦543210

這篇文章主要為大家詳細介紹了如何使用Python編寫詞頻統計工具的相關知識,文中的示例代碼講解詳細,感興趣的小伙伴可以跟隨小編一起學習一下

一、程序工作流程

1.啟動程序，顯示主菜單

2.用戶選擇加載文本的方式：

直接輸入文本
從文件加載

3.程序處理文本，統計詞頻

4.用戶可以選擇：

查看統計摘要
查詢特定單詞頻率
查看所有單詞頻率
導出結果到CSV
可視化展示常見單詞

5.用戶可以選擇退出程序

二、完善代碼

1. 導入庫

import re
from collections import Counter
import matplotlib.pyplot as plt

re: Python的正則表達式庫，用于文本處理

Counter: 來自collections模塊，用于高效計數

matplotlib.pyplot: 用于數據可視化

2. WordFrequencyAnalyzer類

這是程序的核心類，負責文本分析和統計：

初始化方法 __init__

def __init__(self):
    """Set up an analyzer that has no text loaded yet."""
    self.word_freq = Counter()  # counter mapping each word to its frequency
    self.total_words = 0         # total number of words
    self.unique_words = 0        # number of distinct words
    self.most_common = []        # list of the most common words
    self.text_source = "未加載文本" # description of where the text came from

文本加載方法

def load_text(self, text):
    """Load text from a string and compute its word statistics."""
    self.text_source = "直接輸入的文本"
    self._process_text(text)
    
def load_file(self, filename):
    """Load text from a UTF-8 file and compute its word statistics.

    Returns True on success. On any failure an error message is printed
    and False is returned instead of raising.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            text = file.read()
        self.text_source = filename
        self._process_text(text)
        return True
    except FileNotFoundError:
        # Missing file gets its own, more specific message.
        print(f"錯(cuò)誤: 文件 '{filename}' 未找到")
        return False
    except Exception as e:
        # Any other failure while reading or processing is reported generically.
        print(f"讀取文件時(shí)出錯(cuò): {e}")
        return False

load_text: 從用戶輸入的字符串加載文本

load_file: 從文件加載文本，處理文件讀取錯誤

核心文本處理方法 _process_text

def _process_text(self, text):
    """Tokenise the text and recompute all word-frequency statistics."""
    # Lower-case the text and drop every character that is neither a word
    # character nor whitespace (i.e. punctuation).
    cleaned_text = re.sub(r'[^\w\s]', '', text.lower())
    # Split into words on whitespace.
    words = cleaned_text.split()
    
    # Replace the previous statistics with those of this text.
    self.word_freq = Counter(words)
    self.total_words = len(words)
    self.unique_words = len(self.word_freq)
    self.most_common = self.word_freq.most_common()

這是程序的核心處理邏輯：

使用正則表達式移除標點符號
將所有文本轉為小寫
使用split()分割單詞
使用Counter統計詞頻
計算總單詞數和唯一單詞數
獲取最常見的單詞列表

信息獲取方法

def get_total_words(self):
    """Return the total number of words in the loaded text."""
    return self.total_words
    
def get_unique_words(self):
    """Return the number of distinct words in the loaded text."""
    return self.unique_words
    
def get_most_common(self, n=10):
    """Return the first n entries of the most-common word list."""
    return self.most_common[:n]
    
def get_word_frequency(self, word):
    """Return how many times the word occurs (0 if absent); lookup is lower-cased."""
    return self.word_freq.get(word.lower(), 0)

這些方法提供對統計結果的訪問接口。

結果展示方法

def print_summary(self):
    """Print a summary of the statistics."""
    # Show the basic information
    # Show the 10 most common words
 
def print_all_frequencies(self):
    """Print every word together with its frequency."""
    # Show all words and their occurrence counts in alphabetical order
 
def export_to_csv(self, filename="word_frequency.csv"):
    """Export the word-frequency statistics to a CSV file."""
    # Create a CSV file containing word, occurrence count and frequency
 
def visualize_top_words(self, n=15):
    """Plot the n most common words."""
    # Draw a horizontal bar chart with matplotlib

這些方法提供了多種結果展示方式：

控制臺打印摘要

顯示所有單詞頻率

導出到CSV文件

可視化展示

3. 主函數 main()

這是程序的入口點，提供用戶交互界面：

def main():
    # Outline of the interactive loop: most branches below contain only
    # placeholder comments describing the steps, not executable statements.
    analyzer = WordFrequencyAnalyzer()  # create the analyzer instance
    
    # Show the menu
    while True:
        # Show the option menu
        choice = input("\n請(qǐng)選擇操作: ")
        
        # Dispatch on the user's choice
        if choice == '1':  # enter text
            # Read multi-line input
            # Process the text
            
        elif choice == '2':  # load from file
            # Ask for the file name
            # Load the file
            
        elif choice == '3':  # show the statistics summary
            # Check that data is available
            # Show the summary
            
        elif choice == '4':  # look up a word's frequency
            # Ask for the word
            # Look it up and show the result
            
        elif choice == '5':  # show all word frequencies
            # Check that data is available
            # Show all word frequencies
            
        elif choice == '6':  # export to CSV
            # Ask for the file name
            # Export the data
            
        elif choice == '7':  # visualise
            # Ask how many words to show
            # Show the chart
            
        elif choice == '8':  # quit
            break
        
        else:  # invalid choice
            print("無效選擇，請(qǐng)重新輸入")

4. 程序入口

# Start the interactive program only when this file is run as a script.
if __name__ == "__main__":
    main()

當直接運行此Python文件時，會調用main()函數啟動程序。

5.關鍵功能解析

文本處理

# Lower-case the text, delete every character that is neither a word
# character nor whitespace, then split on whitespace.
cleaned_text = re.sub(r'[^\w\s]', '', text.lower())
words = cleaned_text.split()

使用正則表達式 [^\w\s] 匹配所有既不是單詞字符（字母、數字、下劃線）也不是空白字符的字符
將這些字符替換為空字符串，從而移除標點符號
將文本轉為小寫，使"Word"和"word"被視為同一個單詞
使用split()分割單詞

詞頻統計

# Count the occurrences of every word in one pass.
self.word_freq = Counter(words)

Counter是Python的高效計數工具，可以快速統計每個單詞的出現次數

可視化展示

# Horizontal bar chart; both lists are reversed so the first (most common)
# word ends up as the top bar.
plt.figure(figsize=(12, 8))
plt.barh(top_words[::-1], counts[::-1], color='skyblue')

創建水平條形圖
使用[::-1]反轉列表，使最常見的單詞顯示在頂部
設置圖表大小和顏色

多行文本輸入

# Read lines until the user enters a blank (whitespace-only) line, then
# join the collected lines into one text.
text = input("請(qǐng)輸入文本(輸入空行結(jié)束):\n")
lines = []
while text.strip():
    lines.append(text)
    text = input()
full_text = "\n".join(lines)

允許用戶輸入多行文本

當用戶輸入空行時結束輸入

將所有行連接成完整文本

這個程序提供了一個完整的詞頻分析解決方案，從文本輸入、處理、分析到結果展示和導出，功能全面且用戶友好。

三、完整代碼

import re
from collections import Counter
import matplotlib.pyplot as plt
 
class WordFrequencyAnalyzer:
    """Count how often each word occurs in a text and report the results.

    A text is lower-cased, stripped of every character that is neither a
    regex word character nor whitespace, and split on whitespace. Loading a
    new text replaces the statistics of the previous one.
    """

    def __init__(self):
        """Create an analyzer with no text loaded."""
        self.word_freq = Counter()  # word -> occurrence count
        self.total_words = 0  # number of words in the loaded text
        self.unique_words = 0  # number of distinct words
        self.most_common = []  # (word, count) pairs, most frequent first
        self.text_source = "未加載文本"  # label printed by print_summary()

    @staticmethod
    def _normalize(text):
        """Return *text* lower-cased with all punctuation removed."""
        return re.sub(r'[^\w\s]', '', text.lower())

    def load_text(self, text):
        """Load text from a string and recompute the statistics."""
        self.text_source = "直接輸入的文本"
        self._process_text(text)

    def load_file(self, filename):
        """Load text from a UTF-8 encoded file.

        Returns:
            True when the file was read and analysed, False (after printing
            an error message) when it could not be read.
        """
        # Only the I/O sits inside the try block, so an unrelated error in
        # the analysis is never reported as a file problem.
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                text = file.read()
        except FileNotFoundError:
            print(f"錯誤: 文件 '{filename}' 未找到")
            return False
        except (OSError, ValueError) as e:
            # OSError: permissions, directories, ...; ValueError covers
            # undecodable bytes (UnicodeDecodeError) and invalid paths.
            print(f"讀取文件時出錯: {e}")
            return False
        self.text_source = filename
        self._process_text(text)
        return True

    def _process_text(self, text):
        """Tokenise *text* and replace all stored statistics."""
        words = self._normalize(text).split()

        self.word_freq = Counter(words)
        self.total_words = len(words)
        self.unique_words = len(self.word_freq)
        self.most_common = self.word_freq.most_common()

    def get_total_words(self):
        """Return the total number of words in the loaded text."""
        return self.total_words

    def get_unique_words(self):
        """Return the number of distinct words in the loaded text."""
        return self.unique_words

    def get_most_common(self, n=10):
        """Return up to *n* (word, count) pairs, most frequent first.

        A non-positive *n* yields an empty list; a negative slice bound
        would otherwise silently drop entries from the end of the list.
        """
        return self.most_common[:max(n, 0)]

    def get_word_frequency(self, word):
        """Return how many times *word* occurs, or 0 if it does not.

        The query is normalised exactly like the analysed text (lower-cased,
        punctuation removed), so "Hello!" finds the token "hello".
        """
        return self.word_freq.get(self._normalize(word).strip(), 0)

    def print_summary(self):
        """Print source, totals, vocabulary richness and the top 10 words."""
        print("\n===== 文本詞頻統計摘要 =====")
        print(f"文本來源: {self.text_source}")
        print(f"總單詞數: {self.total_words}")
        print(f"唯一單詞數: {self.unique_words}")
        if not self.total_words:
            # Ratios are undefined without words; avoid dividing by zero.
            print("沒有可用的數據")
            return
        print(f"詞匯豐富度: {self.unique_words/self.total_words:.2%}")

        print("\n最常見的10個單詞:")
        for i, (word, count) in enumerate(self.get_most_common(10), 1):
            print(f"{i}. {word}: {count}次 ({count/self.total_words:.2%})")

    def print_all_frequencies(self):
        """Print every word with its count, sorted alphabetically."""
        print("\n===== 所有單詞頻率 =====")
        for word, count in sorted(self.word_freq.items()):
            print(f"{word}: {count}次")

    def export_to_csv(self, filename="word_frequency.csv"):
        """Write word, count and relative frequency to a UTF-8 CSV file.

        Rows are ordered from most to least frequent. Words never contain
        commas or quotes (punctuation is stripped during analysis), so the
        fields need no quoting.

        Returns:
            True on success, False (after printing an error) on failure.
        """
        try:
            with open(filename, 'w', encoding='utf-8') as file:
                file.write("單詞,出現次數,頻率\n")
                for word, count in self.most_common:
                    frequency = count / self.total_words
                    file.write(f"{word},{count},{frequency:.6f}\n")
        except (OSError, ValueError) as e:
            print(f"導出失敗: {e}")
            return False
        print(f"詞頻統計已導出到 {filename}")
        return True

    def visualize_top_words(self, n=15):
        """Show a horizontal bar chart of the *n* most common words.

        Prints a message and returns without plotting when there is no data
        or *n* is smaller than 1. Uses the module-level ``plt``.
        """
        if n < 1 or not self.most_common:
            print("沒有可用的數據")
            return

        top = self.most_common[:n]
        top_words = [word for word, _ in top]
        counts = [count for _, count in top]

        plt.figure(figsize=(12, 8))
        # Reversed so that the most common word is drawn as the top bar.
        plt.barh(top_words[::-1], counts[::-1], color='skyblue')
        plt.xlabel('出現次數')
        # Report the number of words actually shown, which may be below n.
        plt.title(f'文本中最常見的 {len(top)} 個單詞')
        plt.tight_layout()
        plt.show()
 
 
def main():
    """Run the interactive console menu of the word-frequency tool.

    Prints the menu once, then repeatedly reads a choice from standard
    input and dispatches it to a ``WordFrequencyAnalyzer`` until the user
    picks option 8 or standard input is exhausted.
    """
    analyzer = WordFrequencyAnalyzer()

    print("===== 文本詞頻統計器 =====")
    print("1. 輸入文本")
    print("2. 從文件加載")
    print("3. 查看統計摘要")
    print("4. 查詢單詞頻率")
    print("5. 查看所有單詞頻率")
    print("6. 導出到CSV")
    print("7. 可視化展示")
    print("8. 退出")

    try:
        while True:
            # Surrounding whitespace must not turn a valid choice invalid.
            choice = input("\n請選擇操作: ").strip()

            if choice == '8':
                break

            if choice == '1':
                # Collect lines until the first blank (whitespace-only) one.
                lines = []
                line = input("請輸入文本(輸入空行結束):\n")
                while line.strip():
                    lines.append(line)
                    line = input()
                analyzer.load_text("\n".join(lines))
                print(f"已加載文本，共{analyzer.get_total_words()}個單詞")
                continue

            if choice == '2':
                filename = input("請輸入文件名: ").strip()
                if analyzer.load_file(filename):
                    print(f"已從文件加載，共{analyzer.get_total_words()}個單詞")
                continue

            if choice not in ('3', '4', '5', '6', '7'):
                print("無效選擇，請重新輸入")
                continue

            # Every remaining option needs loaded text; check it once here.
            if analyzer.get_total_words() <= 0:
                print("請先加載文本")
                continue

            if choice == '3':
                analyzer.print_summary()

            elif choice == '4':
                word = input("請輸入要查詢的單詞: ").strip()
                count = analyzer.get_word_frequency(word)
                if count > 0:
                    freq = count / analyzer.get_total_words()
                    print(f"單詞 '{word}' 出現了 {count} 次 (頻率: {freq:.2%})")
                else:
                    print(f"單詞 '{word}' 未在文本中出現")

            elif choice == '5':
                analyzer.print_all_frequencies()

            elif choice == '6':
                filename = input("請輸入導出文件名(默認: word_frequency.csv): ").strip()
                analyzer.export_to_csv(filename or "word_frequency.csv")

            else:  # choice == '7'
                raw = input("顯示前多少個單詞? (默認15): ").strip()
                # Only the conversion is guarded, so an error raised while
                # plotting is not misreported as an invalid number.
                try:
                    n = int(raw) if raw else 15
                except ValueError:
                    print("請輸入有效數字")
                    continue
                if n < 1:
                    print("請輸入有效數字")
                    continue
                analyzer.visualize_top_words(n)
    except EOFError:
        # Standard input was closed (piped input ran out or the user sent
        # end-of-file): leave cleanly instead of showing a traceback.
        print()

    print("感謝使用文本詞頻統計器!")
 
 
# Start the interactive program only when this file is run as a script.
if __name__ == "__main__":
    main()

到此這篇關于使用Python編寫詞頻統計工具的示例代碼的文章就介紹到這了,更多相關Python詞頻統計內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章，希望大家以后多多支持腳本之家！

您可能感興趣的文章: