about summary refs log tree commit diff stats
path: root/digester.py
diff options
context:
space:
mode:
Diffstat (limited to 'digester.py')
-rwxr-xr-x digester.py 90
1 files changed, 71 insertions, 19 deletions
diff --git a/digester.py b/digester.py
index 0c17b1c..9f7bff9 100755
--- a/digester.py
+++ b/digester.py
@@ -17,37 +17,67 @@ from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
+
def SigHandler_SIGINT(signum, frame):
print()
sys.exit(0)
+
class Argparser(object):
def __init__(self):
parser = argparse.ArgumentParser()
parser.add_argument("--string", type=str, help="string")
- parser.add_argument("--bool", action="store_true", help="bool", default=False)
- parser.add_argument("--dbg", action="store_true", help="debug", default=False)
+ parser.add_argument(
+ "--bool", action="store_true", help="bool", default=False
+ )
+ parser.add_argument(
+ "--dbg", action="store_true", help="debug", default=False
+ )
self.args = parser.parse_args()
+
# write code here
def premain(argparser):
signal.signal(signal.SIGINT, SigHandler_SIGINT)
- #here
- dataframe = pd.read_csv('/tmp/features.csv')
+ # here
+ dataframe = pd.read_csv("/tmp/features.csv")
dataframe.head()
y = dataframe.target
- X = dataframe.drop(['target'], axis=1)
+ X = dataframe.drop(["target"], axis=1)
- corpus = X['attr']
+ corpus = X["attr"]
vc = CountVectorizer()
vc.fit(corpus)
- numeric_features = pd.concat([X.drop(['attr'], axis=1), pd.DataFrame(vc.transform(corpus).toarray(), columns=vc.vocabulary_)], axis=1)
+ numeric_features = pd.concat(
+ [
+ X.drop(["attr"], axis=1),
+ pd.DataFrame(
+ vc.transform(corpus).toarray(), columns=vc.vocabulary_
+ ),
+ ],
+ axis=1,
+ )
numeric_features.head()
- plt.scatter(dataframe.index, dataframe.target, color='red', label='target')
- plt.scatter(numeric_features.index, numeric_features.depth, color='green', label='depth')
- plt.scatter(numeric_features.index, numeric_features.text_ratio, color='blue', label='text_ratio')
- plt.scatter(numeric_features.index, numeric_features.alink_text_ratio, color='skyblue', label='alink_text_ratio')
+ plt.scatter(dataframe.index, dataframe.target, color="red", label="target")
+ plt.scatter(
+ numeric_features.index,
+ numeric_features.depth,
+ color="green",
+ label="depth",
+ )
+ plt.scatter(
+ numeric_features.index,
+ numeric_features.text_ratio,
+ color="blue",
+ label="text_ratio",
+ )
+ plt.scatter(
+ numeric_features.index,
+ numeric_features.alink_text_ratio,
+ color="skyblue",
+ label="alink_text_ratio",
+ )
plt.legend(loc=(1, 0))
plt.show()
scaler = preprocessing.StandardScaler()
@@ -56,23 +86,43 @@ def premain(argparser):
# clf = MultinomialNB()
# clf = RandomForestClassifier()
- clf = SVC(C=1, kernel='poly', probability=True)
+ clf = SVC(C=1, kernel="poly", probability=True)
clf.fit(scaled_X, y)
predicted_index = clf.predict(scaled_X).tolist().index(True)
scaled_X = scaler.transform(numeric_features)
pred_y = clf.predict(scaled_X)
- print pd.DataFrame(clf.predict_log_proba(scaled_X),columns=clf.classes_)
- print 'Number of mispredicted out of %d is %d (%.2f%%)' % (y.shape[0], (y!=pred_y).sum(), (y!=pred_y).sum()*100.0/y.shape[0])
- print
- print 'Predicted rows:'
- print dataframe[pred_y].drop(['text_ratio', 'alink_text_ratio', 'contain_title'], axis=1).merge(pd.DataFrame(clf.predict_log_proba(scaled_X)[pred_y],columns=clf.classes_, index=dataframe[pred_y].index), left_index=True, right_index=True)
- print
+ print(pd.DataFrame(clf.predict_log_proba(scaled_X), columns=clf.classes_))
+ print(
+ "Number of mispredicted out of %d is %d (%.2f%%)"
+ % (
+ y.shape[0],
+ (y != pred_y).sum(),
+ (y != pred_y).sum() * 100.0 / y.shape[0],
+ )
+ )
+ print()
+ print("Predicted rows:")
+ print(
+ dataframe[pred_y]
+ .drop(["text_ratio", "alink_text_ratio", "contain_title"], axis=1)
+ .merge(
+ pd.DataFrame(
+ clf.predict_log_proba(scaled_X)[pred_y],
+ columns=clf.classes_,
+ index=dataframe[pred_y].index,
+ ),
+ left_index=True,
+ right_index=True,
+ )
+ )
+ print()
# print 'Acutual rows:'
# print dataframe[dataframe.target]
+
def main():
argparser = Argparser()
if argparser.args.dbg:
@@ -80,7 +130,8 @@ def main():
premain(argparser)
except Exception as e:
print(e.__doc__)
- if e.message: print(e.message)
+ if e.message:
+ print(e.message)
variables = globals().copy()
variables.update(locals())
shell = code.InteractiveConsole(variables)
@@ -88,5 +139,6 @@ def main():
else:
premain(argparser)
+
if __name__ == "__main__":
main()