#!/usr/bin/python3
# _*_ coding=utf-8 _*_
# original source:https://github.com/polyrabbit/hacker-news-digest/blob/master/%5Btutorial%5D%20How-to-extract-main-content-from-web-pages-using-Machine-Learning.ipynb
"""Fit a classifier on pre-extracted page features and report its predictions.

The script reads a feature table from ``FEATURES_CSV``, turns the textual
``attr`` column into bag-of-words counts, plots a few of the features, fits
an SVM on the standardized features and prints the predictions it makes on
that same data.
"""

import argparse
import code
import readline
import signal
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC

# Location of the feature table consumed by premain().
FEATURES_CSV = '/tmp/features.csv'


def SigHandler_SIGINT(signum, frame):
    """Handle SIGINT: emit a newline and exit with status 0.

    Args:
        signum: Signal number passed in by the ``signal`` module (unused).
        frame: Interrupted stack frame (unused).
    """
    print()
    sys.exit(0)


class Argparser(object):
    """Parse the command line once and expose the result as ``self.args``."""

    def __init__(self):
        parser = argparse.ArgumentParser()
        parser.add_argument("--string", type=str, help="string")
        parser.add_argument("--bool", action="store_true", help="bool", default=False)
        parser.add_argument("--dbg", action="store_true", help="debug", default=False)
        self.args = parser.parse_args()


def _build_numeric_features(X):
    """Return *X* with its ``attr`` text column replaced by token counts.

    Args:
        X: Feature frame containing a textual ``attr`` column.

    Returns:
        A DataFrame holding every non-``attr`` column of *X* followed by one
        count column per vocabulary term found in ``attr``.
    """
    corpus = X['attr']
    vc = CountVectorizer()
    counts = vc.fit_transform(corpus).toarray()
    # vocabulary_ maps term -> column index and iterates in insertion order,
    # which is not column order; sort by index so labels match the columns.
    terms = sorted(vc.vocabulary_, key=vc.vocabulary_.get)
    # Share X's index so the column-wise concat aligns row for row.
    bag_of_words = pd.DataFrame(counts, columns=terms, index=X.index)
    return pd.concat([X.drop(['attr'], axis=1), bag_of_words], axis=1)


def _plot_features(dataframe, numeric_features):
    """Scatter-plot the target and three numeric features against row index."""
    plt.scatter(dataframe.index, dataframe.target, color='red', label='target')
    plt.scatter(numeric_features.index, numeric_features.depth, color='green', label='depth')
    plt.scatter(numeric_features.index, numeric_features.text_ratio, color='blue', label='text_ratio')
    plt.scatter(numeric_features.index, numeric_features.alink_text_ratio, color='skyblue', label='alink_text_ratio')
    plt.legend(loc=(1, 0))
    plt.show()


def premain(argparser):
    """Load the features, fit the classifier and print its predictions.

    The model is fitted and evaluated on the same rows (no train/test split
    is performed), so the printed misprediction count is a training-set
    figure.

    Args:
        argparser: The ``Argparser`` instance built by ``main``; currently
            not consulted by this function.
    """
    signal.signal(signal.SIGINT, SigHandler_SIGINT)

    dataframe = pd.read_csv(FEATURES_CSV)
    y = dataframe.target
    X = dataframe.drop(['target'], axis=1)

    numeric_features = _build_numeric_features(X)
    _plot_features(dataframe, numeric_features)

    scaler = preprocessing.StandardScaler()
    scaled_X = scaler.fit_transform(numeric_features)

    # Alternatives that were tried: MultinomialNB(), RandomForestClassifier()
    clf = SVC(C=1, kernel='poly', probability=True)
    clf.fit(scaled_X, y)

    pred_y = clf.predict(scaled_X)
    # Computed once and reused for both the full table and the predicted rows.
    log_proba = clf.predict_log_proba(scaled_X)

    print(pd.DataFrame(log_proba, columns=clf.classes_))
    mispredicted = (y != pred_y).sum()
    print('Number of mispredicted out of %d is %d (%.2f%%)' % (
        y.shape[0], mispredicted, mispredicted * 100.0 / y.shape[0]))
    print()
    print('Predicted rows:')
    # NOTE(review): pred_y is used as a row mask, which assumes the target
    # column is boolean -- confirm against the CSV.
    predicted_rows = dataframe[pred_y]
    predicted_log_proba = pd.DataFrame(
        log_proba[pred_y], columns=clf.classes_, index=predicted_rows.index)
    print(predicted_rows
          .drop(['text_ratio', 'alink_text_ratio', 'contain_title'], axis=1)
          .merge(predicted_log_proba, left_index=True, right_index=True))
    print()
    # To compare against the labelled rows: print(dataframe[dataframe.target])


def main():
    """Parse arguments and run ``premain``.

    With ``--dbg``, an exception raised by ``premain`` is reported and an
    interactive console is opened instead of letting the error propagate.
    """
    argparser = Argparser()
    if argparser.args.dbg:
        try:
            premain(argparser)
        except Exception as e:
            print(e.__doc__)
            # Python 3 exceptions have no .message attribute; str(e) carries
            # the text that attribute used to hold.
            if str(e):
                print(e)
            variables = globals().copy()
            variables.update(locals())
            shell = code.InteractiveConsole(variables)
            shell.interact(banner="DEBUG REPL")
    else:
        premain(argparser)


if __name__ == "__main__":
    main()