aboutsummaryrefslogblamecommitdiffstats
path: root/digester.py
blob: 9f7bff90daf625348f79efbcfc906d3b7e83b3d9 (plain) (tree)


















                                                                                                                                                                       
 



                                     
 



                                                                





                                                                     

                                       
 


                                                   

                                                

                        
                                          
 
                      


                          








                                                                      
                           


















                                                                               







                                                 
                                                   





                                                                
























                                                                              



                                       
 






                              

                                






                                                      
 

                          
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# original source:https://github.com/polyrabbit/hacker-news-digest/blob/master/%5Btutorial%5D%20How-to-extract-main-content-from-web-pages-using-Machine-Learning.ipynb

import argparse
import code
import readline
import signal
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC


def SigHandler_SIGINT(signum, frame):
    """Handle SIGINT by emitting a newline and exiting with status 0.

    Both arguments are the ones the ``signal`` module passes to a handler;
    neither is used.
    """
    # End the current terminal line (the one holding "^C") before leaving.
    print("")
    raise SystemExit(0)


class Argparser(object):
    """Parse the process command line and expose the result as ``self.args``.

    Recognised options: ``--string`` (a string value, ``None`` when absent)
    and the boolean switches ``--bool`` and ``--dbg`` (both ``False`` when
    absent).
    """

    def __init__(self):
        cli = argparse.ArgumentParser()
        cli.add_argument("--string", type=str, help="string")
        # The two switches differ only in their flag name and help text.
        for flag, description in (("--bool", "bool"), ("--dbg", "debug")):
            cli.add_argument(
                flag, action="store_true", help=description, default=False
            )
        self.args = cli.parse_args()


# write code here
def premain(argparser):
    """Train an SVM on the extracted page features and report its predictions.

    Reads ``/tmp/features.csv``. The file must provide the columns this
    function references: ``target``, ``attr``, ``depth``, ``text_ratio``,
    ``alink_text_ratio`` and ``contain_title``. The ``attr`` text is expanded
    into bag-of-words counts, a few features are scatter-plotted (the plot
    window blocks until closed), and everything except ``attr`` is
    standardised and used to fit a polynomial-kernel ``SVC``.

    The classifier is scored on the very rows it was trained on, so the
    reported misprediction count is a training error, not a generalisation
    estimate.

    Args:
        argparser: The ``Argparser`` instance built by ``main``; not used
            here.
    """
    signal.signal(signal.SIGINT, SigHandler_SIGINT)

    dataframe = pd.read_csv("/tmp/features.csv")
    y = dataframe.target
    X = dataframe.drop(["target"], axis=1)

    corpus = X["attr"]
    vc = CountVectorizer()
    vc.fit(corpus)

    # vocabulary_ maps term -> column index, but the dict iterates in the
    # order terms were first seen. Order the terms by their index so each
    # count column gets the label of the term it actually counts.
    terms = sorted(vc.vocabulary_, key=vc.vocabulary_.get)
    numeric_features = pd.concat(
        [
            X.drop(["attr"], axis=1),
            pd.DataFrame(
                vc.transform(corpus).toarray(),
                columns=terms,
                index=X.index,  # keep rows aligned for the column-wise concat
            ),
        ],
        axis=1,
    )

    plt.scatter(dataframe.index, dataframe.target, color="red", label="target")
    plt.scatter(
        numeric_features.index,
        numeric_features.depth,
        color="green",
        label="depth",
    )
    plt.scatter(
        numeric_features.index,
        numeric_features.text_ratio,
        color="blue",
        label="text_ratio",
    )
    plt.scatter(
        numeric_features.index,
        numeric_features.alink_text_ratio,
        color="skyblue",
        label="alink_text_ratio",
    )
    plt.legend(loc=(1, 0))
    plt.show()

    scaler = preprocessing.StandardScaler()
    scaler.fit(numeric_features)
    scaled_X = scaler.transform(numeric_features)

    # clf = MultinomialNB()
    # clf = RandomForestClassifier()
    clf = SVC(C=1, kernel="poly", probability=True)
    clf.fit(scaled_X, y)

    pred_y = clf.predict(scaled_X)
    # Compute the log-probabilities once; they are printed in full and again
    # for the predicted rows only.
    log_proba = clf.predict_log_proba(scaled_X)
    # Explicit boolean row mask, so 0/1 targets select rows as well instead
    # of being interpreted as column labels / integer positions.
    # NOTE(review): assumes ``target`` holds booleans or 0/1 - confirm.
    predicted_mask = np.asarray(pred_y, dtype=bool)

    print(pd.DataFrame(log_proba, columns=clf.classes_))
    mispredicted = (y != pred_y).sum()
    print(
        "Number of mispredicted out of %d is %d (%.2f%%)"
        % (
            y.shape[0],
            mispredicted,
            mispredicted * 100.0 / y.shape[0],
        )
    )
    print()
    print("Predicted rows:")
    # An empty selection is fine here: it simply prints an empty frame.
    predicted_rows = dataframe[predicted_mask]
    print(
        predicted_rows.drop(
            ["text_ratio", "alink_text_ratio", "contain_title"], axis=1
        ).merge(
            pd.DataFrame(
                log_proba[predicted_mask],
                columns=clf.classes_,
                index=predicted_rows.index,
            ),
            left_index=True,
            right_index=True,
        )
    )
    print()


def main():
    """Parse the command line and run ``premain``.

    Without ``--dbg`` any exception from ``premain`` propagates normally.
    With ``--dbg`` the exception is reported and an interactive console is
    opened whose namespace holds the module globals plus this function's
    locals (including the caught exception as ``e``).
    """
    argparser = Argparser()
    if argparser.args.dbg:
        try:
            premain(argparser)
        except Exception as e:
            print(e.__doc__)
            # Python 3 exceptions have no ``.message`` attribute; reading it
            # raised AttributeError here and the REPL never started.
            # ``str(e)`` yields the message text instead.
            message = str(e)
            if message:
                print(message)
            variables = globals().copy()
            variables.update(locals())
            shell = code.InteractiveConsole(variables)
            shell.interact(banner="DEBUG REPL")
    else:
        premain(argparser)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()