author     terminaldweller <thabogre@gmail.com>  2021-07-29 11:41:20 +0000
committer  terminaldweller <thabogre@gmail.com>  2021-07-29 11:41:20 +0000
commit     fab529b1c0bc71fa4c904e7a251038426f81369f (patch)
tree       a75317cae4dfab07e5042067d1246516f89a44c3 /main.py
parent     first commit (diff)
No need to fetch the links externally; devourer can do that on its own now. The project now uses poetry, and a Dockerfile has been added.
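With this change the page to harvest links from is passed on the command line instead of being piped in on stdin. A minimal invocation sketch (the URL is a placeholder, not something from the commit):

    python main.py --source "https://example.com/page-with-links"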
Diffstat
-rwxr-xr-x  main.py  65
1 file changed, 58 insertions(+), 7 deletions(-)
diff --git a/main.py b/main.py
index c9a1af6..00ce6d1 100755
--- a/main.py
+++ b/main.py
@@ -3,24 +3,75 @@
import argparse
import logging
-import traceback
from newspaper import Article, build
-import fileinput
+from bs4 import BeautifulSoup
+from contextlib import closing
+from requests import get
+from requests.exceptions import RequestException
+from re import findall


class Argparser(object):
def __init__(self):
parser = argparse.ArgumentParser()
- parser.add_argument("--string", type=str, help="string")
+ parser.add_argument(
+ "--source",
+ type=str, help="the url where the urls to be extracted reside")
parser.add_argument("--bool", action="store_true",
help="bool", default=False)
- parser.add_argument("--dbg", action="store_true",
- help="debug", default=False)
self.args = parser.parse_args()


+# TODO: maybe actually do some real logging
+def logError(err):
+ print(err)
+
+
+def isAGoodResponse(resp):
+    # .get() avoids a KeyError when the Content-Type header is missing
+    content_type = resp.headers.get("Content-Type", "").lower()
+    return resp.status_code == 200 and content_type.find("html") > -1
+
+
+def simpleGet(url):
+ try:
+ with closing(get(url, stream=True)) as resp:
+ if isAGoodResponse(resp):
+ return resp.content
+ else:
+ return None
+ except RequestException as e:
+ logError("Error during requests to {0} : {1}".format(url, str(e)))
+ return None
+
+
+def getURLS(source):
+ result = dict()
+ raw_ml = simpleGet(source)
+ ml = BeautifulSoup(raw_ml, "lxml")
+ ml_str = repr(ml)
+ tmp = open("/tmp/riecher", "w")
+ tmp.write(ml_str)
+ tmp.close()
+ tmp = open("/tmp/riecher", "r")
+ dump_list = []
+ for line in tmp:
+        dummy = findall(
+            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|'
+            r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)
+ dump_list += dummy
+ for elem in dump_list:
+ result[elem] = elem
+ tmp.close()
+ return result
+
+
def main():
- urls = (line for line in fileinput.input())
+ argparser = Argparser()
+ urls = getURLS(argparser.args.source)
+ # import sys
+ # print(urls)
+ # sys.exit(0)
for url in urls:
parser = build(url)
for article in parser.articles:
@@ -30,7 +81,7 @@ def main():
a.parse()
print(a.text)
except Exception as e:
- logging.error(traceback.format_exc(e))
+ logging.exception(e)


if __name__ == "__main__":
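
For reference, the same harvesting can be done without the /tmp/riecher round-trip that getURLS makes. A minimal sketch under the imports from this commit (the function name and the early return on a failed fetch are assumptions, not part of the diff):

    def getURLSInMemory(source):
        # fetch and re-serialize the page, as getURLS does
        raw_ml = simpleGet(source)
        if raw_ml is None:  # assumption: give up quietly when the fetch fails
            return {}
        ml_str = repr(BeautifulSoup(raw_ml, "lxml"))
        # same URL regex as the diff above, applied to the whole document
        urls = findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|'
            r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', ml_str)
        # same dict-based deduplication as getURLS
        return {url: url for url in urls}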