aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-x digester.py 92
1 files changed, 92 insertions, 0 deletions
diff --git a/digester.py b/digester.py
new file mode 100755
index 0000000..0c17b1c
--- /dev/null
+++ b/digester.py
@@ -0,0 +1,92 @@
+#!/usr/bin/python3
+# _*_ coding=utf-8 _*_
+# original source:https://github.com/polyrabbit/hacker-news-digest/blob/master/%5Btutorial%5D%20How-to-extract-main-content-from-web-pages-using-Machine-Learning.ipynb
+
+import argparse
+import code
+import readline
+import signal
+import sys
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn import preprocessing
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.naive_bayes import GaussianNB, MultinomialNB
+from sklearn.svm import SVC
+
+def SigHandler_SIGINT(signum, frame):
+    """Signal handler: emit a newline, then terminate with exit status 0.
+
+    Both arguments are supplied by the ``signal`` machinery and are ignored.
+    """
+    # The bare newline moves the shell prompt off the line holding "^C".
+    print()
+    raise SystemExit(0)
+
+class Argparser:
+    """Parse the process command line and expose the result as ``self.args``.
+
+    Recognised options: ``--string`` (a str value), ``--bool`` and ``--dbg``
+    (flags that default to False and become True when given).
+    """
+
+    def __init__(self):
+        cli = argparse.ArgumentParser()
+        cli.add_argument("--string", type=str, help="string")
+        # The two switches differ only in name and help text.
+        for flag, description in (("--bool", "bool"), ("--dbg", "debug")):
+            cli.add_argument(flag, action="store_true", default=False, help=description)
+        self.args = cli.parse_args()
+
+# write code here
+def premain(argparser):
+    """Fit an SVC on ``/tmp/features.csv`` and report how it does on that same data.
+
+    The CSV is read with pandas and must provide the columns this function
+    touches: ``target`` (the label), ``attr`` (free text, bag-of-words encoded)
+    and the numeric columns ``depth``, ``text_ratio``, ``alink_text_ratio`` and
+    ``contain_title``.  The function
+
+    1. installs ``SigHandler_SIGINT`` as the SIGINT handler,
+    2. builds a numeric feature matrix (numeric columns + ``attr`` word counts),
+    3. shows a scatter plot of the label and three features (blocks in
+       ``plt.show()`` until the window is closed),
+    4. standardises the features, fits a polynomial-kernel SVC and predicts on
+       the training rows themselves (there is no train/test split here),
+    5. prints the per-class log-probabilities, the misprediction count and the
+       rows predicted as positive.
+
+    Args:
+        argparser: the ``Argparser`` instance built by ``main``; currently unused.
+    """
+    signal.signal(signal.SIGINT, SigHandler_SIGINT)
+
+    dataframe = pd.read_csv('/tmp/features.csv')
+    y = dataframe.target
+    X = dataframe.drop(['target'], axis=1)
+
+    # Empty CSV cells are read as NaN, which CountVectorizer cannot tokenise.
+    corpus = X['attr'].fillna('')
+    vc = CountVectorizer()
+    bag_of_words = vc.fit_transform(corpus).toarray()
+    # vocabulary_ maps term -> column index; iterating the dict directly yields
+    # terms in first-seen order, which mislabels the columns.  Order by index.
+    vocab_columns = sorted(vc.vocabulary_, key=vc.vocabulary_.get)
+
+    numeric_features = pd.concat(
+        [
+            X.drop(['attr'], axis=1),
+            # Share X's index so concat lines the rows up one-to-one.
+            pd.DataFrame(bag_of_words, columns=vocab_columns, index=X.index),
+        ],
+        axis=1,
+    )
+
+    plt.scatter(dataframe.index, dataframe.target, color='red', label='target')
+    plt.scatter(numeric_features.index, numeric_features.depth, color='green', label='depth')
+    plt.scatter(numeric_features.index, numeric_features.text_ratio, color='blue', label='text_ratio')
+    plt.scatter(numeric_features.index, numeric_features.alink_text_ratio, color='skyblue', label='alink_text_ratio')
+    plt.legend(loc=(1, 0))
+    plt.show()
+
+    scaled_X = preprocessing.StandardScaler().fit_transform(numeric_features)
+
+    # Alternatives left by the author: MultinomialNB(), RandomForestClassifier().
+    clf = SVC(C=1, kernel='poly', probability=True)
+    clf.fit(scaled_X, y)
+    pred_y = clf.predict(scaled_X)
+    log_proba = clf.predict_log_proba(scaled_X)
+
+    print(pd.DataFrame(log_proba, columns=clf.classes_))
+    total = y.shape[0]
+    mispredicted = (y != pred_y).sum()
+    print('Number of mispredicted out of %d is %d (%.2f%%)' % (total, mispredicted, mispredicted * 100.0 / total))
+    print()
+    print('Predicted rows:')
+    # pred_y doubles as a row mask below -- assumes ``target`` is boolean in the
+    # CSV so the predictions are booleans too (TODO confirm against the data).
+    predicted_rows = dataframe[pred_y]
+    predicted_log_proba = pd.DataFrame(log_proba[pred_y], columns=clf.classes_, index=predicted_rows.index)
+    print(predicted_rows.drop(['text_ratio', 'alink_text_ratio', 'contain_title'], axis=1).merge(predicted_log_proba, left_index=True, right_index=True))
+    print()
+
+def main():
+    """Parse the command line and run ``premain``.
+
+    Without ``--dbg`` any exception from ``premain`` propagates normally.
+    With ``--dbg`` the exception is reported (its class docstring, then its
+    message if it has one) and an interactive console is opened whose namespace
+    holds the module globals plus this function's locals, including the caught
+    exception as ``e``.
+    """
+    argparser = Argparser()
+    if not argparser.args.dbg:
+        premain(argparser)
+        return
+
+    try:
+        premain(argparser)
+    except Exception as e:
+        print(e.__doc__)
+        # Python 3 exceptions have no ``.message`` attribute; str(e) carries
+        # the text instead.
+        if str(e):
+            print(e)
+        # Start the REPL inside the handler: ``e`` is unbound once it exits.
+        variables = {**globals(), **locals()}
+        shell = code.InteractiveConsole(variables)
+        shell.interact(banner="DEBUG REPL")
+
+# Script entry point: call main() only when executed directly, not on import.
+if __name__ == "__main__":
+ main()