
Python - Machine Learning Classification Methods (KNN, Naive Bayes, and Decision Trees)

Posted: 2020-08-11 13:52:58


1. KNN: the K-nearest-neighbors method loads the entire training set into memory. To classify a test instance, it measures the distance between that instance and every training instance and, based on those distances, selects the K nearest instances in the training set; the test instance is then assigned the majority class among those K neighbors (a from-scratch sketch follows after this list).

2. Naive Bayes classifier: naive Bayes rests on an "independence assumption": the features of an instance are assumed to be mutually independent given the class; for example, the words appearing in a document are treated as independent of one another, and probabilities are computed under that assumption. Concretely, a document d with words w1, ..., wn is assigned the class c that maximizes P(c | d) ∝ P(c) × P(w1 | c) × ... × P(wn | c) (a toy worked example follows after this list). Classifying positive and negative words within documents is comparatively involved and is not covered in detail here.

3. Decision tree: a decision tree is built from hierarchically organized if-then rules. It is easy to interpret and handles multi-class problems, but it overfits easily and suffers badly under class imbalance, so take care to prune the tree and to split the dataset so that each split preserves the class proportions (a stratified-split and pruning sketch follows after this list).
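To make the KNN mechanism in item 1 concrete, here is a minimal from-scratch sketch (illustrative only: knn_predict and its toy data are invented for this example; the full recipe in section 1 below uses scikit-learn instead):

import numpy as np
from collections import Counter

def knn_predict(train_x, train_y, test_point, k=3):
    """Classify one point by majority vote among its k nearest training points."""
    # Euclidean distance from the test point to every training instance
    distances = np.linalg.norm(train_x - test_point, axis=1)
    # indices of the k smallest distances
    nearest = np.argsort(distances)[:k]
    # majority vote among the labels of those k neighbors
    return Counter(train_y[nearest]).most_common(1)[0][0]

# toy usage: two clusters, classify a point near the second one
train_x = np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.1], [5.2, 4.9]])
train_y = np.array([0, 0, 1, 1])
print(knn_predict(train_x, train_y, np.array([4.8, 5.0])))  # prints 1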
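To illustrate the naive Bayes equation in item 2, here is a toy sketch with made-up two-word documents (this is not the NLTK classifier used in section 2; train_nb and predict_nb are invented names for this illustration). It estimates word probabilities with add-one (Laplace) smoothing and scores classes in log space so small probabilities do not underflow:

import math
from collections import Counter

def train_nb(docs, labels):
    """Estimate log P(c) and the word counts needed for log P(w|c)."""
    classes = set(labels)
    priors, word_counts, totals = {}, {}, {}
    vocab = {w for doc in docs for w in doc}
    for c in classes:
        class_docs = [d for d, l in zip(docs, labels) if l == c]
        priors[c] = math.log(len(class_docs) / len(docs))  # log P(c)
        counts = Counter(w for d in class_docs for w in d)
        word_counts[c], totals[c] = counts, sum(counts.values())
    return priors, word_counts, totals, vocab

def predict_nb(doc, priors, word_counts, totals, vocab):
    """Return argmax_c [ log P(c) + sum_i log P(w_i | c) ]."""
    scores = {}
    for c in priors:
        score = priors[c]
        for w in doc:
            # add-one smoothing so unseen words do not zero out the product
            score += math.log((word_counts[c][w] + 1) / (totals[c] + len(vocab)))
        scores[c] = score
    return max(scores, key=scores.get)

# toy usage
docs = [["good", "great"], ["bad", "awful"], ["good", "fun"]]
labels = ["pos", "neg", "pos"]
print(predict_nb(["great", "fun"], *train_nb(docs, labels)))  # prints 'pos'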
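Finally, a minimal sketch of the two precautions in item 3, using scikit-learn on the iris data (the pruning values are illustrative, not tuned): stratify=y preserves the class proportions in both halves of the split, while ccp_alpha and min_samples_leaf prune and limit the tree to curb overfitting:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

x, y = load_iris(return_X_y=True)
# stratified split: train and test keep the same class proportions
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=77)
# cost-complexity pruning plus a minimum leaf size to rein in overfitting
model = DecisionTreeClassifier(criterion="entropy",
                               ccp_alpha=0.01, min_samples_leaf=5)
model.fit(train_x, train_y)
print("test accuracy = %0.3f" % model.score(test_x, test_y))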

1. KNN

# -*- coding: utf-8 -*-
"""Created on Wed Apr 25 17:15:34 @author: Alvin AI"""
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import itertools


def get_data():
    """Make a sample classification dataset.
    Return: independent variables x, dependent variable y."""
    # generate 100 synthetic samples with 4 features each
    x, y = make_classification(n_features=4)
    return x, y


def get_train_test(x, y):
    """Prepare a stratified 80/20 train/test split."""
    stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2,
                                              random_state=77)
    for train_indx, test_indx in stratified_split.split(x, y):
        train_x, train_y = x[train_indx], y[train_indx]
        test_x, test_y = x[test_indx], y[test_indx]
    return train_x, train_y, test_x, test_y


def build_model(x, y, k=3):
    """Fit a k-nearest-neighbors classifier."""
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x, y)
    return knn


def test_model(x, y, knn_model):
    y_predicted = knn_model.predict(x)
    print(classification_report(y, y_predicted))


def plot_data(x, y):
    """Plot a scatter plot for all variable combinations."""
    subplot_start = 321
    col_numbers = range(0, 4)
    # all pairwise combinations of the four feature columns: 0 1 2 3
    col_pairs = itertools.combinations(col_numbers, 2)
    for col_pair in col_pairs:
        plt.subplot(subplot_start)
        plt.scatter(x[:, col_pair[0]], x[:, col_pair[1]], c=y)
        plt.title(str(col_pair[0]) + "-" + str(col_pair[1]))
        plt.xlabel(str(col_pair[0]))
        plt.ylabel(str(col_pair[1]))
        subplot_start += 1
    plt.show()


if __name__ == "__main__":
    x, y = get_data()
    plot_data(x, y)
    train_x, train_y, test_x, test_y = get_train_test(x, y)
    knn_model = build_model(train_x, train_y, k=3)
    print("\nmodel evaluation on training set")
    print("==================================\n")
    test_model(train_x, train_y, knn_model)
    print("\nmodel evaluation on test set")
    print("==================================\n")
    test_model(test_x, test_y, knn_model)
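The script above hardcodes k = 3. As a small follow-up sketch (not part of the original recipe), candidate values of k can be compared with 5-fold cross-validation before committing to one:

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification

x, y = make_classification(n_features=4)
# score a few candidate neighborhood sizes with 5-fold cross-validation
for k in [1, 3, 5, 7, 9]:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), x, y, cv=5)
    print("k = %d, mean accuracy = %0.3f" % (k, scores.mean()))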

2. Naive Bayes

# -*- coding: utf-8 -*-
"""Created on Tue Apr 03 19:05:18 @author: Alvin AI"""
from nltk.corpus import movie_reviews
from sklearn.model_selection import StratifiedShuffleSplit
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


# -------- load data --------
def get_data():
    """Get movie review data."""
    dataset = []
    y_labels = []
    # extract the categories
    for cat in movie_reviews.categories():
        # for every file under this category
        for fileid in movie_reviews.fileids(cat):
            # get the words belonging to this file
            words = list(movie_reviews.words(fileid))
            dataset.append((words, cat))
            y_labels.append(cat)
    return dataset, y_labels


# -------- arrange the data into train and test sets --------
def get_train_test(input_dataset, ylabels):
    """Prepare a stratified train and test split."""
    train_size = 0.7
    test_size = 1 - train_size
    stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                              random_state=77)
    for train_idx, test_idx in stratified_split.split(input_dataset, ylabels):
        train = [input_dataset[i] for i in train_idx]
        train_y = [ylabels[i] for i in train_idx]
        test = [input_dataset[i] for i in test_idx]
        test_y = [ylabels[i] for i in test_idx]
    return train, train_y, test, test_y


# -------- build features --------
def build_word_features(instance):
    """Build a feature dictionary.
    Features are binary: the name of each feature is the word itself
    and the value is 1. Features are stored in a dictionary
    called feature_set."""
    feature_set = {}  # a dictionary holds the features
    words = instance[0]  # first item of the instance tuple is the word list
    for word in words:
        feature_set[word] = 1
    return (feature_set, instance[1])  # second item of the tuple is the class label


def build_negate_features(instance):
    """If a word is preceded by either 'not' or 'no',
    this function adds the prefix 'Not_' to that word.
    The negation word itself ('not' or 'no') is not inserted
    into the feature dictionary."""
    words = instance[0]
    final_words = []
    negate = False  # boolean flag tracking whether the previous word was a negation
    negate_words = ['no', 'not']
    # While looping over the words: when a negation word is met, the flag
    # becomes True and the negation word is skipped; while the flag is True,
    # the next word gets the 'Not_' prefix and the flag is reset to False.
    for word in words:
        if negate:
            word = 'Not_' + word
            negate = False
        if word not in negate_words:
            final_words.append(word)
        else:
            negate = True
    feature_set = {}
    for word in final_words:
        feature_set[word] = 1
    return (feature_set, instance[1])


# -------- remove stop words --------
def remove_stop_words(in_data):
    """Utility function to remove stop words from the given list of words."""
    stopword_list = stopwords.words('english')
    negate_words = ['no', 'not']
    # We do not want to delete the negation words, so we build
    # a new stop word list that excludes them.
    new_stopwords = [word for word in stopword_list if word not in negate_words]
    label = in_data[1]
    # remove stop words and punctuation
    words = [word for word in in_data[0]
             if word not in new_stopwords and word not in punctuation]
    return (words, label)


def build_keyphrase_features(instance):
    """Extract key phrases from the given text.
    Key phrases are words of importance according to a measure;
    here a key phrase has length 2, i.e. bigrams of two words."""
    feature_set = {}
    instance = remove_stop_words(instance)
    words = instance[0]
    bigram_finder = BigramCollocationFinder.from_words(words)
    # We use the raw frequency count of bigrams: bigrams are sorted
    # by frequency in descending order and the top 400 are selected.
    bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq, 400)
    for bigram in bigrams:
        feature_set[bigram] = 1
    return (feature_set, instance[1])


# -------- build the model --------
def build_model(features):
    """Build a naive Bayes model with the given feature set."""
    model = nltk.NaiveBayesClassifier.train(features)
    return model


def probe_model(model, features, dataset_type='Train'):
    """A utility function to check the goodness of our model."""
    accuracy = nltk.classify.accuracy(model, features)
    print("\n" + dataset_type + " Accuracy = %0.2f" % (accuracy * 100) + "%")


def show_features(model, no_features=5):
    """A utility function to see how important various features are for our model."""
    print("\nFeature importance")
    print("===================\n")
    model.show_most_informative_features(no_features)


# -------- improve the model --------
def build_model_cycle_1(train_data, dev_data):
    """First pass at trying out our model."""
    # build features for the training set
    train_features = list(map(build_word_features, train_data))
    # build features for the dev set
    dev_features = list(map(build_word_features, dev_data))
    model = build_model(train_features)
    probe_model(model, train_features)
    probe_model(model, dev_features, 'Dev')
    return model


def build_model_cycle_2(train_data, dev_data):
    """Second pass at trying out our model."""
    train_features = list(map(build_negate_features, train_data))
    dev_features = list(map(build_negate_features, dev_data))
    model = build_model(train_features)
    probe_model(model, train_features)
    probe_model(model, dev_features, 'Dev')
    return model


def build_model_cycle_3(train_data, dev_data, test_data):
    """Third pass at trying out our model."""
    train_features = list(map(build_keyphrase_features, train_data))
    dev_features = list(map(build_keyphrase_features, dev_data))
    model = build_model(train_features)
    probe_model(model, train_features)
    probe_model(model, dev_features, 'Dev')
    test_features = list(map(build_keyphrase_features, test_data))
    probe_model(model, test_features, 'Test')
    return model


# call the functions defined above
if __name__ == "__main__":
    # load the data
    input_dataset, y_labels = get_data()
    # training data
    train_data, train_y, all_test_data, all_test_y = \
        get_train_test(input_dataset, y_labels)
    # dev data
    dev_data, dev_y, test_data, test_y = get_train_test(all_test_data, all_test_y)
    # look at the size of the different datasets
    print("\nOriginal data size =", len(input_dataset))
    print("\ntraining data size =", len(train_data))
    print("\ndev data size =", len(dev_data))
    print("\ntest data size =", len(test_data))
    # the different modelling passes
    model_cycle_1 = build_model_cycle_1(train_data, dev_data)
    # print information about the model
    show_features(model_cycle_1)
    model_cycle_2 = build_model_cycle_2(train_data, dev_data)
    show_features(model_cycle_2)
    model_cycle_3 = build_model_cycle_3(train_data, dev_data, test_data)
    show_features(model_cycle_3)

3. Decision Trees

# -*- coding: utf-8 -*-
"""Created on Wed Apr 04 14:47:27 @author: Alvin AI"""
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import tree
from sklearn.metrics import accuracy_score, classification_report, \
    confusion_matrix
import pprint


# load the data
def get_data():
    """Get iris data."""
    data = load_iris()
    x = data['data']
    y = data['target']
    label_names = data['target_names']
    return x, y, label_names.tolist()


# split into train and test sets
def get_train_test(x, y):
    """Prepare a stratified 80/20 train/test split."""
    stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2,
                                              random_state=77)
    for train_indx, test_indx in stratified_split.split(x, y):
        train_x, train_y = x[train_indx], y[train_indx]
        test_x, test_y = x[test_indx], y[test_indx]
    return train_x, train_y, test_x, test_y


# build the model
def build_model(x, y):
    model = tree.DecisionTreeClassifier(criterion="entropy")
    model = model.fit(x, y)
    return model


# test the model
def test_model(x, y, model, label_names):
    y_predicted = model.predict(x)
    print("Model accuracy = %0.2f" % (accuracy_score(y, y_predicted) * 100) + "%\n")
    print("\nConfusion Matrix")
    print("==================")
    # pprint prints python data structures in a neatly formatted way
    pprint.pprint(confusion_matrix(y, y_predicted))
    print("\nclassification report")
    print("==================")
    print(classification_report(y, y_predicted, target_names=label_names))


def get_feature_names():
    data = load_iris()
    return data['feature_names']


def probe_model(model):
    feature_names = get_feature_names()
    feature_importance = model.feature_importances_
    print("\nfeature importance\n")
    print("======================\n")
    for i, feature_name in enumerate(feature_names):
        print("%s = %0.3f" % (feature_name, feature_importance[i]))
    # export the decision tree as a Graphviz .dot file
    tree.export_graphviz(model, out_file='tree.dot')


if __name__ == "__main__":
    # load the data
    x, y, label_names = get_data()
    # split the data into train and test sets
    train_x, train_y, test_x, test_y = get_train_test(x, y)
    # build the model
    model = build_model(train_x, train_y)
    # evaluate the model on the training set
    test_model(train_x, train_y, model, label_names)
    # evaluate the model on the test set
    test_model(test_x, test_y, model, label_names)
    probe_model(model)
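probe_model writes the fitted tree to tree.dot. Two ways to actually look at it, as a sketch (the first assumes both the graphviz Python package and the Graphviz binaries are installed; model and get_feature_names come from the script above):

import graphviz
import matplotlib.pyplot as plt
from sklearn import tree

# render the exported dot file to tree.png
with open('tree.dot') as f:
    graphviz.Source(f.read()).render('tree', format='png')

# or draw the fitted tree directly with matplotlib (sklearn >= 0.21)
tree.plot_tree(model, feature_names=get_feature_names(), filled=True)
plt.show()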
