diff options
Diffstat (limited to 'digester.py')
-rwxr-xr-x | digester.py | 90 |
1 files changed, 71 insertions, 19 deletions
diff --git a/digester.py b/digester.py index 0c17b1c..9f7bff9 100755 --- a/digester.py +++ b/digester.py @@ -17,37 +17,67 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import GaussianNB, MultinomialNB from sklearn.svm import SVC + def SigHandler_SIGINT(signum, frame): print() sys.exit(0) + class Argparser(object): def __init__(self): parser = argparse.ArgumentParser() parser.add_argument("--string", type=str, help="string") - parser.add_argument("--bool", action="store_true", help="bool", default=False) - parser.add_argument("--dbg", action="store_true", help="debug", default=False) + parser.add_argument( + "--bool", action="store_true", help="bool", default=False + ) + parser.add_argument( + "--dbg", action="store_true", help="debug", default=False + ) self.args = parser.parse_args() + # write code here def premain(argparser): signal.signal(signal.SIGINT, SigHandler_SIGINT) - #here - dataframe = pd.read_csv('/tmp/features.csv') + # here + dataframe = pd.read_csv("/tmp/features.csv") dataframe.head() y = dataframe.target - X = dataframe.drop(['target'], axis=1) + X = dataframe.drop(["target"], axis=1) - corpus = X['attr'] + corpus = X["attr"] vc = CountVectorizer() vc.fit(corpus) - numeric_features = pd.concat([X.drop(['attr'], axis=1), pd.DataFrame(vc.transform(corpus).toarray(), columns=vc.vocabulary_)], axis=1) + numeric_features = pd.concat( + [ + X.drop(["attr"], axis=1), + pd.DataFrame( + vc.transform(corpus).toarray(), columns=vc.vocabulary_ + ), + ], + axis=1, + ) numeric_features.head() - plt.scatter(dataframe.index, dataframe.target, color='red', label='target') - plt.scatter(numeric_features.index, numeric_features.depth, color='green', label='depth') - plt.scatter(numeric_features.index, numeric_features.text_ratio, color='blue', label='text_ratio') - plt.scatter(numeric_features.index, numeric_features.alink_text_ratio, color='skyblue', label='alink_text_ratio') + plt.scatter(dataframe.index, dataframe.target, color="red", label="target") + plt.scatter( + numeric_features.index, + numeric_features.depth, + color="green", + label="depth", + ) + plt.scatter( + numeric_features.index, + numeric_features.text_ratio, + color="blue", + label="text_ratio", + ) + plt.scatter( + numeric_features.index, + numeric_features.alink_text_ratio, + color="skyblue", + label="alink_text_ratio", + ) plt.legend(loc=(1, 0)) plt.show() scaler = preprocessing.StandardScaler() @@ -56,23 +86,43 @@ def premain(argparser): # clf = MultinomialNB() # clf = RandomForestClassifier() - clf = SVC(C=1, kernel='poly', probability=True) + clf = SVC(C=1, kernel="poly", probability=True) clf.fit(scaled_X, y) predicted_index = clf.predict(scaled_X).tolist().index(True) scaled_X = scaler.transform(numeric_features) pred_y = clf.predict(scaled_X) - print pd.DataFrame(clf.predict_log_proba(scaled_X),columns=clf.classes_) - print 'Number of mispredicted out of %d is %d (%.2f%%)' % (y.shape[0], (y!=pred_y).sum(), (y!=pred_y).sum()*100.0/y.shape[0]) - print - print 'Predicted rows:' - print dataframe[pred_y].drop(['text_ratio', 'alink_text_ratio', 'contain_title'], axis=1).merge(pd.DataFrame(clf.predict_log_proba(scaled_X)[pred_y],columns=clf.classes_, index=dataframe[pred_y].index), left_index=True, right_index=True) - print + print(pd.DataFrame(clf.predict_log_proba(scaled_X), columns=clf.classes_)) + print( + "Number of mispredicted out of %d is %d (%.2f%%)" + % ( + y.shape[0], + (y != pred_y).sum(), + (y != pred_y).sum() * 100.0 / y.shape[0], + ) + ) + print() + print("Predicted rows:") + print( + dataframe[pred_y] + .drop(["text_ratio", "alink_text_ratio", "contain_title"], axis=1) + .merge( + pd.DataFrame( + clf.predict_log_proba(scaled_X)[pred_y], + columns=clf.classes_, + index=dataframe[pred_y].index, + ), + left_index=True, + right_index=True, + ) + ) + print() # print 'Acutual rows:' # print dataframe[dataframe.target] + def main(): argparser = Argparser() if argparser.args.dbg: @@ -80,7 +130,8 @@ def main(): premain(argparser) except Exception as e: print(e.__doc__) - if e.message: print(e.message) + if e.message: + print(e.message) variables = globals().copy() variables.update(locals()) shell = code.InteractiveConsole(variables) @@ -88,5 +139,6 @@ def main(): else: premain(argparser) + if __name__ == "__main__": main() |