for email in ham_email_list: for word in email: word_set.add(word) for email in spam_email_list: for word in email: word_set.add(word) # 计算每个词在正常邮件出现的次数
word_dict = {}
for word in word_set: word_dict[word] = 0
for email in ham_email_list: for word1 in email: if (word==word1):
def get_spam_dic(ham_email_list, spam_email_list):
    """Count, for every known word, how many spam emails contain it.

    The vocabulary is the union of all words seen in both the ham and the
    spam emails, so words that occur only in ham are present with a count
    of 0.

    Args:
        ham_email_list: Iterable of ham emails, each an iterable of words.
        spam_email_list: Iterable of spam emails, each an iterable of words.

    Returns:
        dict mapping each word to the number of spam emails it occurs in.

    NOTE(review): the tail of the original loop was lost; this is
    reconstructed as a per-email (document) count, which matches the
    email-count denominator used by get_spam_rate. Confirm against the
    original source.
    """
    # Collect every distinct word from both classes of email.
    vocabulary = set()
    for email in ham_email_list:
        vocabulary.update(email)
    for email in spam_email_list:
        vocabulary.update(email)

    # Count the spam emails each word occurs in.
    word_dict = dict.fromkeys(vocabulary, 0)
    for email in spam_email_list:
        # set(email): a word repeated inside one email is counted once.
        for word in set(email):
            word_dict[word] += 1
    return word_dict
# Compute the score of a message against the ham (normal mail) statistics.
def get_ham_rate(filename, ham_w_dict, ham_dir="data/ham"):
    """Return the ham score of the email stored in *filename*.

    The file is read, passed through ``clear_content`` and reduced to its
    set of distinct tokens. The counts of those tokens found in
    *ham_w_dict* are summed, smoothed, and divided by the number of ham
    training emails plus 2.

    Args:
        filename: Path of the email file to score.
        ham_w_dict: Mapping of word -> count for ham emails.
        ham_dir: Directory whose entry count is used as the number of ham
            training emails. Defaults to the previously hard-coded
            ``"data/ham"``.

    Returns:
        The smoothed score ``(1 + sum of matched counts) / (ham emails + 2)``.
    """
    with open(filename, mode="r") as f:
        content = clear_content(f.read())
    # Each distinct token of the message contributes at most once.
    test_set = set(content)

    ham_email_num = len(os.listdir(ham_dir))

    # Laplace smoothing: start from 1 and add the count of every token that
    # is present in the ham dictionary. Per the original author's note,
    # counts are added rather than multiplied because a product would
    # become too small (addition stands in for working in the log domain).
    laplasi = sum(
        (ham_w_dict[word] for word in test_set if word in ham_w_dict), 1
    )
    return laplasi / (ham_email_num + 2)
# Compute the score of a message against the spam statistics.
def get_spam_rate(filename, spam_w_dict, spam_dir="data/spam"):
    """Return the spam score of the email stored in *filename*.

    The file is read, passed through ``clear_content`` and reduced to its
    set of distinct tokens. The counts of those tokens found in
    *spam_w_dict* are summed, smoothed, and divided by the number of spam
    training emails plus 2.

    Args:
        filename: Path of the email file to score.
        spam_w_dict: Mapping of word -> count for spam emails.
        spam_dir: Directory whose entry count is used as the number of spam
            training emails. Defaults to the previously hard-coded
            ``"data/spam"``.

    Returns:
        The smoothed score ``(1 + sum of matched counts) / (spam emails + 2)``.
    """
    with open(filename, mode="r") as f:
        content = clear_content(f.read())
    # Each distinct token of the message contributes at most once.
    test_set = set(content)

    spam_email_num = len(os.listdir(spam_dir))

    # Laplace smoothing: start from 1 and add the count of every token that
    # is present in the spam dictionary. Per the original author's note,
    # counts are added rather than multiplied because a product would
    # become too small (addition stands in for working in the log domain).
    laplasi = sum(
        (spam_w_dict[word] for word in test_set if word in spam_w_dict), 1
    )
    return laplasi / (spam_email_num + 2)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
~~~
import numpy as np


def email_divide(folderpath):
    """Classify every file in *folderpath* and print the verdict.

    For each directory entry the ham and spam scores are computed with
    ``get_ham_rate`` / ``get_spam_rate`` using the module-level
    ``ham_w_dict`` and ``spam_w_dict``. The file path is printed, followed
    by a line saying whether the message is judged spam or normal mail.

    Args:
        folderpath: Directory containing the email files to classify.
    """
    # Both classes get the same prior of 1/2; the term is loop-invariant,
    # so it is computed once instead of twice per file.
    log_prior = np.log(1 / 2)

    for filename in os.listdir(folderpath):
        file_path = os.path.join(folderpath, filename)
        print(f"{file_path}")
        ham = get_ham_rate(file_path, ham_w_dict) + log_prior
        spam = get_spam_rate(file_path, spam_w_dict) + log_prior
        # A tie falls through to the "normal mail" branch.
        if spam > ham:
            print('p1>p2,所以是垃圾邮件.')
        else:
            print('p1<p2,所以是正常邮件.')


# Script entry point: classify the bundled test messages.
email_divide("data/test")