Diffstat (limited to 'devourer.py')
-rwxr-xr-x | devourer.py | 148
1 file changed, 95 insertions, 53 deletions
diff --git a/devourer.py b/devourer.py
index 9188d09..94358dd 100755
--- a/devourer.py
+++ b/devourer.py
@@ -9,6 +9,8 @@ import tika
 import docker
 import os
 import nltk
+import random
+import string
 from newspaper import Article, build, Config
 from bs4 import BeautifulSoup
 from contextlib import closing
@@ -26,32 +28,60 @@ WIKIPEDIA_SEARCH_URL = "https://en.wikipedia.org/w/api.php"
 class Argparser(object):
     def __init__(self):
         parser = argparse.ArgumentParser()
-        parser.add_argument("--source",
-                            type=str, help="the url where the \
-                            urls to be extracted reside")
-        parser.add_argument("--out", type=str,
-                            help="the output file", default="")
-        parser.add_argument("--singlelink", action="store_true",
-                            help="whether the app should work in single-link \
+        parser.add_argument(
+            "--source",
+            type=str,
+            help="the url where the \
+            urls to be extracted reside",
+            default="",
+        )
+        parser.add_argument(
+            "--out",
+            type=str,
+            help="the output file name if it applies",
+            default="",
+        )
+        parser.add_argument(
+            "--singlelink",
+            action="store_true",
+            help="whether the app should work in single-link \
             meaning only one page's contents will be used \
-                            mode", default=False)
-        parser.add_argument("--multilink", action="store_true",
-                            help="whether the app should work in multi-link \
+            mode",
+            default=False,
+        )
+        parser.add_argument(
+            "--multilink",
+            action="store_true",
+            help="whether the app should work in multi-link \
             mode meaning the source contains a list of links \
             rather than being the actual source itself",
-                            default=False)
-        parser.add_argument("--sourcetype", type=str,
-                            help="determines the type of the \
-                            source.html,text,...")
-        parser.add_argument("--pdftomp3", action="store_true",
-                            default=False, help="convert pdf to mp3. \
+            default=False,
+        )
+        parser.add_argument(
+            "--sourcetype",
+            type=str,
+            help="determines the type of the \
+            source:html,text,...",
+            default="html",
+        )
+        parser.add_argument(
+            "--pdftomp3",
+            action="store_true",
+            default=False,
+            help="convert pdf to mp3. \
             source should be the path to a pdf file and\
-                            out should be the path to the mp3 output file")
-        parser.add_argument("--summary", type=str, default="newspaper",
-                            help="which summary type to use. currently we \
-                            have newspaper, bart and none.")
-        parser.add_argument("--search", type=str,
-                            default="", help="the search query")
+            out should be the path to the mp3 output file",
+        )
+        parser.add_argument(
+            "--summary",
+            type=str,
+            default="newspaper",
+            help="which summary type to use. currently we \
+            have newspaper, bart and none.",
+        )
+        parser.add_argument(
+            "--search", type=str, default="", help="the string to search for"
+        )
         self.args = parser.parse_args()
 
 
@@ -63,9 +93,8 @@ def logError(err: RequestException) -> None:
 
 def isAGoodResponse(resp: Response) -> bool:
     """Checks whether the get we sent got a 200 response."""
-    content_type = resp.headers['Content-Type'].lower()
-    return (resp.status_code == 200 and
-            content_type is not None)
+    content_type = resp.headers["Content-Type"].lower()
+    return resp.status_code == 200 and content_type is not None
 
 
 def simpleGet(url: str) -> bytes:
@@ -94,23 +123,32 @@ def getWithParams(url: str, params: dict) -> dict:
     return None
 
 
+def getRandStr(n):
+    """Return a random string of the given length."""
+    return "".join([random.choice(string.ascii_lowercase) for i in range(n)])
+
+
 def getURLS(source: str) -> dict:
     """Extracts the urls from a website."""
     result = dict()
     raw_ml = simpleGet(source)
     ml = BeautifulSoup(raw_ml, "lxml")
+
+    rand_tmp = "/tmp/" + getRandStr(20)
     ml_str = repr(ml)
-    tmp = open("/tmp/riecher", "w")
+    tmp = open(rand_tmp, "w")
     tmp.write(ml_str)
     tmp.close()
-    tmp = open("/tmp/riecher", "r")
-    dump_list = []
+    tmp = open(rand_tmp, "r")
+    url_list = []
     for line in tmp:
-        dummy = findall(
-            'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|'
-            r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)
-        dump_list += dummy
-    for elem in dump_list:
+        url = findall(
+            "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|"
+            r"(?:%[0-9a-fA-F][0-9a-fA-F]))+",
+            line,
+        )
+        url_list += url
+    for elem in url_list:
         result[elem] = elem
     tmp.close()
     return result
@@ -138,16 +176,16 @@ def pdfToVoice(argparser: Argparser) -> None:
     TIKA_SERVER_ENDPOINT = "127.0.0.1:9977"
     os.environ["TIKA_SERVER_ENDPOINT"] = TIKA_SERVER_ENDPOINT
     dockerClient = docker.from_env()
-    container = dockerClient.containers.run("apache/tika:2.0.0", detach=True,
-                                            ports={TIKA_SERVER_ENDPOINT:
-                                                   "9998"})
+    container = dockerClient.containers.run(
+        "apache/tika:2.0.0", detach=True, ports={TIKA_SERVER_ENDPOINT: "9998"}
+    )
     while True:
         resp = get("http://127.0.0.1:9977")
         if resp.status_code == 200:
             break
-        time.sleep(.5)
+        time.sleep(0.5)
     rawText = tika.parser.from_file()
-    tts = gTTS(rawText['content'])
+    tts = gTTS(rawText["content"])
     tts.save(argparser.args.out)
     container.stop()
     dockerClient.close()
@@ -155,7 +193,7 @@ def pdfToVoice(argparser: Argparser) -> None:
 
 def extractRequirements(textBody: str) -> list:
     """Extract the sentences containing the keywords
-        that denote a requirement."""
+    that denote a requirement."""
     result = []
     REQ_KEYWORDS = ["shall", "should", "must", "may", "can", "could"]
     nltk.download("punkt")
@@ -169,28 +207,32 @@
 
 def summarizeText(text: str) -> str:
     """Summarize the given text using bart."""
-    from transformers import BartTokenizer, BartForConditionalGeneration
-    model = BartForConditionalGeneration.from_pretrained(
-        'facebook/bart-large-cnn')
-    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
-    inputs = tokenizer([text],
-                       max_length=1024, return_tensors='pt')
+    import transformers
+
+    model = transformers.BartForConditionalGeneration.from_pretrained(
+        "facebook/bart-large-cnn"
+    )
+    tokenizer = transformers.BartTokenizer.from_pretrained("facebook/bart-large-cnn")
+    inputs = tokenizer([text], max_length=1024, return_tensors="pt")
     summary_ids = model.generate(
-        inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
-    return([tokenizer.decode(g,
-                             skip_special_tokens=True,
-                             clean_up_tokenization_spaces=False)
-            for g in summary_ids])
+        inputs["input_ids"], num_beams=4, max_length=5, early_stopping=True
+    )
+    return [
+        tokenizer.decode(
+            g, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        for g in summary_ids
+    ]
 
 
 def textToAudio(text: str) -> None:
     """Transform the given text into audio."""
     tts = gTTS(text)
-    tts.save(time.today().strftime("%b-%d-%Y-%M-%S-%f")+".mp3")
+    tts.save(time.today().strftime("%b-%d-%Y-%M-%S-%f") + ".mp3")
 
 
 def singleLinkMode(argparser: Argparser) -> dict:
-    """runs the single-link main function"""
+    """Runs the single-link main function."""
    if argparser.args.sourcetype == "html":
         parser = build(argparser.args.source)
         for article in parser.articles:
@@ -243,7 +285,7 @@ def searchWikipedia(argparser: Argparser) -> str:
         "namespace": "0",
         "search": argparser.args.search,
         "limit": "10",
-        "format": "json"
+        "format": "json",
    }
     res = getWithParams(WIKIPEDIA_SEARCH_URL, searchParmas)
     print(res)
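Review notes

On getRandStr/getURLS: writing the scraped markup to "/tmp/" plus a random suffix avoids the old fixed "/tmp/riecher" name, but the standard library already solves this. A minimal sketch of the same write-then-scan round-trip using tempfile (extract_urls is an illustrative name, not from this commit; URL_RE is the regex from getURLS above):

    import tempfile
    from re import findall

    URL_RE = (
        "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|"
        r"(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )

    def extract_urls(ml_str: str) -> dict:
        """Mirror getURLS: dump the markup to a temp file, scan it for URLs."""
        result = {}
        # tempfile picks a collision-free name and removes the file on close,
        # so no random-suffix helper and no leftover files under /tmp
        with tempfile.NamedTemporaryFile(mode="w+", prefix="devourer-") as tmp:
            tmp.write(ml_str)
            tmp.seek(0)
            for line in tmp:
                for url in findall(URL_RE, line):
                    result[url] = url
        return result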
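On isAGoodResponse: the reformatted version still indexes resp.headers["Content-Type"] directly, so a response without that header raises KeyError before the "is not None" test can ever be False. A sketch of the guard the check seems to intend, using the .get accessor on the requests headers mapping (is_good_response is an illustrative name):

    from requests import Response

    def is_good_response(resp: Response) -> bool:
        """True only for a 200 response that carries a Content-Type header."""
        content_type = resp.headers.get("Content-Type")  # None when absent
        return resp.status_code == 200 and content_type is not None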
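On summarizeText: model.generate keeps max_length=5, which caps the generated summary at roughly five tokens, and the tokenizer call does not truncate inputs longer than BART's 1024-token window. A minimal usage sketch of the same transformers calls with more conventional settings (the min_length/max_length values below are illustrative, not from this commit):

    import transformers

    def summarize(text: str) -> str:
        model = transformers.BartForConditionalGeneration.from_pretrained(
            "facebook/bart-large-cnn"
        )
        tokenizer = transformers.BartTokenizer.from_pretrained(
            "facebook/bart-large-cnn"
        )
        # truncation=True keeps long articles inside the 1024-token encoder window
        inputs = tokenizer(
            [text], max_length=1024, truncation=True, return_tensors="pt"
        )
        summary_ids = model.generate(
            inputs["input_ids"],
            num_beams=4,
            min_length=56,
            max_length=142,
            early_stopping=True,
        )
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)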