black and update

author: terminaldweller <thabogre@gmail.com> 2022-01-27 17:51:54 +0000
committer: terminaldweller <thabogre@gmail.com> 2022-01-27 17:51:54 +0000
commit: 02c8661250be26dc35b71c7fa9fb0f2eb9890b44 (patch)
tree: 708839587fb6e16b6e37465e15259461fb0b13fe /digester.py
parent: update (diff)
download: seer-02c8661250be26dc35b71c7fa9fb0f2eb9890b44.tar.gz seer-02c8661250be26dc35b71c7fa9fb0f2eb9890b44.zip
1 files changed, 71 insertions, 19 deletions
diff --git a/digester.py b/digester.py
index 0c17b1c..9f7bff9 100755
--- a/digester.py
+++ b/digester.py
@@ -17,37 +17,67 @@ from sklearn.ensemble import RandomForestClassifier
 from sklearn.naive_bayes import GaussianNB, MultinomialNB
 from sklearn.svm import SVC
 
+
 def SigHandler_SIGINT(signum, frame):
     print()
     sys.exit(0)
 
+
 class Argparser(object):
     def __init__(self):
         parser = argparse.ArgumentParser()
         parser.add_argument("--string", type=str, help="string")
-        parser.add_argument("--bool", action="store_true", help="bool", default=False)
-        parser.add_argument("--dbg", action="store_true", help="debug", default=False)
+        parser.add_argument(
+            "--bool", action="store_true", help="bool", default=False
+        )
+        parser.add_argument(
+            "--dbg", action="store_true", help="debug", default=False
+        )
         self.args = parser.parse_args()
 
+
 # write code here
 def premain(argparser):
     signal.signal(signal.SIGINT, SigHandler_SIGINT)
-    #here
-    dataframe = pd.read_csv('/tmp/features.csv')
+    # here
+    dataframe = pd.read_csv("/tmp/features.csv")
     dataframe.head()
     y = dataframe.target
-    X = dataframe.drop(['target'], axis=1)
+    X = dataframe.drop(["target"], axis=1)
 
-    corpus = X['attr']
+    corpus = X["attr"]
     vc = CountVectorizer()
     vc.fit(corpus)
 
-    numeric_features = pd.concat([X.drop(['attr'], axis=1), pd.DataFrame(vc.transform(corpus).toarray(), columns=vc.vocabulary_)], axis=1)
+    numeric_features = pd.concat(
+        [
+            X.drop(["attr"], axis=1),
+            pd.DataFrame(
+                vc.transform(corpus).toarray(), columns=vc.vocabulary_
+            ),
+        ],
+        axis=1,
+    )
     numeric_features.head()
-    plt.scatter(dataframe.index, dataframe.target, color='red', label='target')
-    plt.scatter(numeric_features.index, numeric_features.depth, color='green', label='depth')
-    plt.scatter(numeric_features.index, numeric_features.text_ratio, color='blue', label='text_ratio')
-    plt.scatter(numeric_features.index, numeric_features.alink_text_ratio, color='skyblue', label='alink_text_ratio')
+    plt.scatter(dataframe.index, dataframe.target, color="red", label="target")
+    plt.scatter(
+        numeric_features.index,
+        numeric_features.depth,
+        color="green",
+        label="depth",
+    )
+    plt.scatter(
+        numeric_features.index,
+        numeric_features.text_ratio,
+        color="blue",
+        label="text_ratio",
+    )
+    plt.scatter(
+        numeric_features.index,
+        numeric_features.alink_text_ratio,
+        color="skyblue",
+        label="alink_text_ratio",
+    )
     plt.legend(loc=(1, 0))
     plt.show()
     scaler = preprocessing.StandardScaler()
@@ -56,23 +86,43 @@ def premain(argparser):
 
     # clf = MultinomialNB()
     # clf = RandomForestClassifier()
-    clf = SVC(C=1, kernel='poly', probability=True)
+    clf = SVC(C=1, kernel="poly", probability=True)
     clf.fit(scaled_X, y)
     predicted_index = clf.predict(scaled_X).tolist().index(True)
 
     scaled_X = scaler.transform(numeric_features)
     pred_y = clf.predict(scaled_X)
 
-    print pd.DataFrame(clf.predict_log_proba(scaled_X),columns=clf.classes_)
-    print 'Number of mispredicted out of %d is %d (%.2f%%)' % (y.shape[0], (y!=pred_y).sum(), (y!=pred_y).sum()*100.0/y.shape[0])
-    print
-    print 'Predicted rows:'
-    print dataframe[pred_y].drop(['text_ratio', 'alink_text_ratio', 'contain_title'], axis=1).merge(pd.DataFrame(clf.predict_log_proba(scaled_X)[pred_y],columns=clf.classes_, index=dataframe[pred_y].index), left_index=True, right_index=True)
-    print
+    print(pd.DataFrame(clf.predict_log_proba(scaled_X), columns=clf.classes_))
+    print(
+        "Number of mispredicted out of %d is %d (%.2f%%)"
+        % (
+            y.shape[0],
+            (y != pred_y).sum(),
+            (y != pred_y).sum() * 100.0 / y.shape[0],
+        )
+    )
+    print()
+    print("Predicted rows:")
+    print(
+        dataframe[pred_y]
+        .drop(["text_ratio", "alink_text_ratio", "contain_title"], axis=1)
+        .merge(
+            pd.DataFrame(
+                clf.predict_log_proba(scaled_X)[pred_y],
+                columns=clf.classes_,
+                index=dataframe[pred_y].index,
+            ),
+            left_index=True,
+            right_index=True,
+        )
+    )
+    print()
 
     # print 'Acutual rows:'
     # print dataframe[dataframe.target]
 
+
 def main():
     argparser = Argparser()
     if argparser.args.dbg:
@@ -80,7 +130,8 @@ def main():
             premain(argparser)
         except Exception as e:
             print(e.__doc__)
-            if e.message: print(e.message)
+            if e.message:
+                print(e.message)
             variables = globals().copy()
             variables.update(locals())
             shell = code.InteractiveConsole(variables)
@@ -88,5 +139,6 @@ def main():
     else:
         premain(argparser)
 
+
 if __name__ == "__main__":
     main()
author	terminaldweller <thabogre@gmail.com>	2022-01-27 17:51:54 +0000
committer	terminaldweller <thabogre@gmail.com>	2022-01-27 17:51:54 +0000
commit	02c8661250be26dc35b71c7fa9fb0f2eb9890b44 (patch)
tree	708839587fb6e16b6e37465e15259461fb0b13fe /digester.py
parent	update (diff)
download	seer-02c8661250be26dc35b71c7fa9fb0f2eb9890b44.tar.gz seer-02c8661250be26dc35b71c7fa9fb0f2eb9890b44.zip