diff options
author | bloodstalker <thabogre@gmail.com> | 2018-10-09 15:08:34 +0000 |
---|---|---|
committer | bloodstalker <thabogre@gmail.com> | 2018-10-09 15:08:34 +0000 |
commit | 4545ee850bd928a3a6c9982f0606baebc2e138bd (patch) | |
tree | 4e973a652a042a9fc7c5491447bf77ae0cf1fed5 | |
parent | update (diff) | |
download | seer-4545ee850bd928a3a6c9982f0606baebc2e138bd.tar.gz seer-4545ee850bd928a3a6c9982f0606baebc2e138bd.zip |
update
-rwxr-xr-x | digester.py | 92 |
1 files changed, 92 insertions, 0 deletions
#!/usr/bin/python3
# _*_ coding=utf-8 _*_
# original source:https://github.com/polyrabbit/hacker-news-digest/blob/master/%5Btutorial%5D%20How-to-extract-main-content-from-web-pages-using-Machine-Learning.ipynb
"""Fit a classifier on /tmp/features.csv and report how it does on that data.

The script reads a feature table, expands the text column ``attr`` into
bag-of-words counts, plots a few columns, fits an SVM on the standardized
features and prints the predictions for the same rows it was fitted on.
"""

import argparse
import code
import readline
import signal
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC


def SigHandler_SIGINT(signum, frame):
    """Handle SIGINT: print a newline and exit with status 0."""
    print()
    sys.exit(0)


class Argparser(object):
    """Parse the command line once and expose the result as ``self.args``."""

    def __init__(self):
        parser = argparse.ArgumentParser()
        parser.add_argument("--string", type=str, help="string")
        parser.add_argument("--bool", action="store_true", help="bool", default=False)
        parser.add_argument("--dbg", action="store_true", help="debug", default=False)
        self.args = parser.parse_args()


# write code here
def premain(argparser):
    """Load the feature table, fit the classifier and print its predictions.

    Args:
        argparser: the parsed command line (an ``Argparser``); currently
            not consulted by this function.

    The input is read from ``/tmp/features.csv`` and must contain the columns
    ``target``, ``attr``, ``depth``, ``text_ratio``, ``alink_text_ratio`` and
    ``contain_title``. A matplotlib window is shown before fitting, and
    ``plt.show()`` is called without arguments.
    """
    signal.signal(signal.SIGINT, SigHandler_SIGINT)

    dataframe = pd.read_csv('/tmp/features.csv')
    y = dataframe.target
    X = dataframe.drop(['target'], axis=1)

    corpus = X['attr']
    vc = CountVectorizer()
    vc.fit(corpus)

    # vocabulary_ maps term -> column index of the transformed matrix, and
    # iterating the dict does not follow that index. Sort the terms by their
    # index so every count column gets the label of the term it counts.
    terms = sorted(vc.vocabulary_, key=vc.vocabulary_.get)
    bag_of_words = pd.DataFrame(
        vc.transform(corpus).toarray(), columns=terms, index=X.index
    )
    numeric_features = pd.concat([X.drop(['attr'], axis=1), bag_of_words], axis=1)

    plt.scatter(dataframe.index, dataframe.target, color='red', label='target')
    plt.scatter(numeric_features.index, numeric_features.depth, color='green', label='depth')
    plt.scatter(numeric_features.index, numeric_features.text_ratio, color='blue', label='text_ratio')
    plt.scatter(numeric_features.index, numeric_features.alink_text_ratio, color='skyblue', label='alink_text_ratio')
    plt.legend(loc=(1, 0))
    plt.show()

    scaler = preprocessing.StandardScaler()
    scaled_X = scaler.fit_transform(numeric_features)

    # Alternatives that were tried: MultinomialNB(), RandomForestClassifier()
    clf = SVC(C=1, kernel='poly', probability=True)
    clf.fit(scaled_X, y)

    # Predictions are made on the very rows the model was fitted on, so the
    # figures printed below describe the fit, not held-out performance.
    pred_y = clf.predict(scaled_X)
    log_proba = pd.DataFrame(
        clf.predict_log_proba(scaled_X), columns=clf.classes_, index=dataframe.index
    )

    total = y.shape[0]
    mispredicted = (y != pred_y).sum()

    print(log_proba)
    print('Number of mispredicted out of %d is %d (%.2f%%)' % (total, mispredicted, mispredicted * 100.0 / total))
    print()
    print('Predicted rows:')
    # NOTE(review): assumes the target labels are boolean (or 0/1) so the
    # predictions can be used as a row mask - confirm against the CSV.
    predicted_rows = np.asarray(pred_y, dtype=bool)
    print(
        dataframe[predicted_rows]
        .drop(['text_ratio', 'alink_text_ratio', 'contain_title'], axis=1)
        .merge(log_proba[predicted_rows], left_index=True, right_index=True)
    )
    print()


def main():
    """Run ``premain``; with ``--dbg``, open a REPL if it raises."""
    argparser = Argparser()
    if argparser.args.dbg:
        try:
            premain(argparser)
        except Exception as e:
            print(e.__doc__)
            # Python 3 exceptions have no .message attribute; str(e) is the
            # portable way to get the message text.
            message = str(e)
            if message:
                print(message)
            # Expose module globals and this frame's locals (including the
            # exception ``e``) to the interactive console.
            variables = globals().copy()
            variables.update(locals())
            shell = code.InteractiveConsole(variables)
            shell.interact(banner="DEBUG REPL")
    else:
        premain(argparser)


if __name__ == "__main__":
    main()