#!/usr/bin/python3
# -*- coding: utf-8 -*-
# original source:https://github.com/polyrabbit/hacker-news-digest/blob/master/%5Btutorial%5D%20How-to-extract-main-content-from-web-pages-using-Machine-Learning.ipynb
import argparse
import code
import readline
import signal
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
def SigHandler_SIGINT(signum, frame):
    """Handle Ctrl-C: emit a newline so the shell prompt stays clean, then exit 0."""
    sys.stdout.write("\n")
    sys.exit(0)
class Argparser(object):
    """Thin wrapper around argparse holding this script's command-line flags.

    Parsed options are exposed as the ``args`` attribute:
    ``--string`` (str), ``--bool`` (flag), ``--dbg`` (flag, debug REPL).
    """

    def __init__(self):
        cli = argparse.ArgumentParser()
        cli.add_argument("--string", type=str, help="string")
        cli.add_argument("--bool", action="store_true", help="bool", default=False)
        cli.add_argument("--dbg", action="store_true", help="debug", default=False)
        self.args = cli.parse_args()
# write code here
def premain(argparser):
    """Load hand-crafted page features from /tmp/features.csv, visualize a few
    of them, train an SVC on the full set, and print in-sample predictions.

    NOTE(review): the classifier is trained and evaluated on the same rows, so
    the reported error rate is an optimistic in-sample figure (kept as in the
    original tutorial).  Assumes the CSV has a boolean-ish ``target`` column,
    a free-text ``attr`` column, plus numeric columns including ``depth``,
    ``text_ratio``, ``alink_text_ratio``, ``contain_title`` -- TODO confirm
    against the feature-extraction step.

    :param argparser: parsed CLI options (currently unused inside this body)
    """
    signal.signal(signal.SIGINT, SigHandler_SIGINT)

    dataframe = pd.read_csv("/tmp/features.csv")
    y = dataframe.target
    X = dataframe.drop(["target"], axis=1)

    # Vectorize the free-text "attr" column into bag-of-words counts.
    corpus = X["attr"]
    vc = CountVectorizer()
    vc.fit(corpus)
    # BUG FIX: the original passed `columns=vc.vocabulary_`.  vocabulary_ is a
    # term->column-index dict whose iteration order is insertion order, NOT
    # column order, so the labels did not line up with the count matrix.
    # get_feature_names_out() yields the terms in actual column order.
    numeric_features = pd.concat(
        [
            X.drop(["attr"], axis=1),
            pd.DataFrame(
                vc.transform(corpus).toarray(),
                columns=vc.get_feature_names_out(),
            ),
        ],
        axis=1,
    )

    # Scatter a few engineered features against the row index for inspection.
    plt.scatter(dataframe.index, dataframe.target, color="red", label="target")
    plt.scatter(
        numeric_features.index,
        numeric_features.depth,
        color="green",
        label="depth",
    )
    plt.scatter(
        numeric_features.index,
        numeric_features.text_ratio,
        color="blue",
        label="text_ratio",
    )
    plt.scatter(
        numeric_features.index,
        numeric_features.alink_text_ratio,
        color="skyblue",
        label="alink_text_ratio",
    )
    plt.legend(loc=(1, 0))
    plt.show()

    # Standardize features before feeding the SVM (SVC is scale-sensitive).
    scaler = preprocessing.StandardScaler()
    scaler.fit(numeric_features)
    scaled_X = scaler.transform(numeric_features)

    # clf = MultinomialNB()
    # clf = RandomForestClassifier()
    clf = SVC(C=1, kernel="poly", probability=True)
    clf.fit(scaled_X, y)
    # BUG FIX: dropped the original `predicted_index = ....index(True)` -- the
    # result was never used and .index(True) raises ValueError whenever no row
    # is predicted positive.  Also removed a redundant second
    # `scaler.transform(numeric_features)` call.
    pred_y = clf.predict(scaled_X)
    print(pd.DataFrame(clf.predict_log_proba(scaled_X), columns=clf.classes_))
    print(
        "Number of mispredicted out of %d is %d (%.2f%%)"
        % (
            y.shape[0],
            (y != pred_y).sum(),
            (y != pred_y).sum() * 100.0 / y.shape[0],
        )
    )
    print()
    print("Predicted rows:")
    # Show the positively-predicted rows side by side with their class
    # log-probabilities (index-aligned merge).
    print(
        dataframe[pred_y]
        .drop(["text_ratio", "alink_text_ratio", "contain_title"], axis=1)
        .merge(
            pd.DataFrame(
                clf.predict_log_proba(scaled_X)[pred_y],
                columns=clf.classes_,
                index=dataframe[pred_y].index,
            ),
            left_index=True,
            right_index=True,
        )
    )
    print()
def main():
    """Entry point: parse CLI flags and run premain.

    With --dbg, any exception from premain is printed and a live REPL is
    opened with the current globals/locals for post-mortem inspection.
    """
    argparser = Argparser()
    if argparser.args.dbg:
        try:
            premain(argparser)
        except Exception as e:
            # BUG FIX: Python 3 exceptions have no `.message` attribute, so the
            # original `if e.message:` raised AttributeError inside the handler.
            # Print the exception itself instead.
            print(e.__doc__)
            print(e)
            variables = globals().copy()
            variables.update(locals())
            shell = code.InteractiveConsole(variables)
            shell.interact(banner="DEBUG REPL")
    else:
        premain(argparser)
if __name__ == "__main__":
main()