WIP

author: terminaldweller <thabogre@gmail.com> 2021-10-06 22:30:13 +0000
committer: terminaldweller <thabogre@gmail.com> 2021-10-06 22:30:13 +0000
commit: 45f45cd9f76af87fc848551a555f02c49dc7f75b (patch)
tree: 04467c054964fe231239b57c883522a058b8f25b /devourer.py
parent: WIP (diff)
download: devourer-45f45cd9f76af87fc848551a555f02c49dc7f75b.tar.gz
devourer-45f45cd9f76af87fc848551a555f02c49dc7f75b.zip
1 files changed, 90 insertions, 20 deletions
diff --git a/main.py b/devourer.py
index 8176293..863a8b0 100755
--- a/main.py
+++ b/devourer.py
@@ -20,6 +20,9 @@ from gtts import gTTS
 from datetime import datetime as time
 
 
+WIKIPEDIA_SEARCH_URL = "https://en.wikipedia.org/w/api.php"
+
+
 class Argparser(object):
     def __init__(self):
         parser = argparse.ArgumentParser()
@@ -44,6 +47,11 @@ class Argparser(object):
                             default=False, help="convert pdf to mp3. \
                             source should be the path to a pdf file and\
                             out should be the path to the mp3 output file")
+        parser.add_argument("--summary", type=str, default="newspaper",
+                            help="which summary type to use. currently we \
+                            have newspaper, bart and none.")
+        parser.add_argument("--search", type=str,
+                            default="", help="the search query")
         self.args = parser.parse_args()
 
 
@@ -61,7 +69,7 @@ def isAGoodResponse(resp: Response) -> bool:
 
 
 def simpleGet(url: str) -> bytes:
-    """issues a simple get request to download a website"""
+    """issues a simple get request"""
     try:
         with closing(get(url, stream=True)) as resp:
             if isAGoodResponse(resp):
@@ -73,6 +81,19 @@ def simpleGet(url: str) -> bytes:
         return None
 
 
+def getWithParams(url: str, params: dict) -> dict:
+    """issues a get request with params"""
+    try:
+        with closing(get(url, params=params, stream=True)) as resp:
+            if isAGoodResponse(resp):
+                return resp.json()
+            else:
+                return None
+    except RequestException as e:
+        logError("Error during requests to {0} : {1}".format(url, str(e)))
+        return None
+
+
 def getURLS(source: str) -> dict:
     """extracts the urls from a website"""
     result = dict()
@@ -103,8 +124,9 @@ def configNews(config: Config) -> None:
     config.browser_user_agent = "Chrome/91.0.4464.5"
 
 
-def call_from_shell_list(command_list):
-    # should probably deprecate this at some point
+def call_from_shell_list(command_list: list):
+    """run a shell command given a list of command/arguments"""
+    # TODO-should probably deprecate this at some point
     if sys.version_info < (3, 7):
         return subprocess.run(command_list, stdout=subprocess.PIPE)
     else:
@@ -132,6 +154,8 @@ def pdfToVoice(argparser: Argparser) -> None:
 
 
 def extractRequirements(textBody: str) -> list:
+    """extract the sentences containing the keywords
+     that denote a requirement"""
     result = []
     REQ_KEYWORDS = ["shall", "should", "must", "may", "can", "could"]
     nltk.download("punkt")
@@ -143,6 +167,28 @@ def extractRequirements(textBody: str) -> list:
     return result
 
 
+def summarizeText(text: str) -> str:
+    """summarize the given text using bart"""
+    from transformers import BartTokenizer, BartForConditionalGeneration
+    model = BartForConditionalGeneration.from_pretrained(
+        'facebook/bart-large-cnn')
+    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+    inputs = tokenizer([text],
+                       max_length=1024, return_tensors='pt')
+    summary_ids = model.generate(
+        inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
+    return([tokenizer.decode(g,
+                             skip_special_tokens=True,
+                             clean_up_tokenization_spaces=False)
+            for g in summary_ids])
+
+
+def textToAudio(text: str) -> None:
+    """transform the given text into audio"""
+    tts = gTTS(text)
+    tts.save(time.today().strftime("%b-%d-%Y-%M-%S-%f")+".mp3")
+
+
 def singleLinkMode(argparser: Argparser) -> dict:
     """runs the single-link main function"""
     if argparser.args.sourcetype == "html":
@@ -162,35 +208,59 @@ def singleLinkMode(argparser: Argparser) -> dict:
         extractRequirements(bytesText.decode("utf-8"))
 
 
-def multiLinkMode(argparser: Argparser) -> None:
-    """run the multi-link main function"""
+def summarizeLinkToAudio(argparser: Argparser) -> None:
+    """summarizes the text inside a given url into audio"""
+    try:
+        article = Article(argparser.args.source)
+        article.download()
+        article.parse()
+        if argparser.args.summary == "newspaper":
+            article.nlp()
+            textToAudio(article.summary)
+        elif argparser.args.summary == "none":
+            textToAudio(article.text)
+        elif argparser.args.summary == "bart":
+            textToAudio(summarizeText(article.text))
+        else:
+            print("invalid option for summry type.")
+    except Exception as e:
+        logging.exception(e)
+
+
+def summarizeLinksToAudio(argparser: Argparser) -> None:
+    """summarize a list of urls into audio files"""
     config = Config()
     configNews(config)
     urls = getURLS(argparser.args.source)
     for url in urls:
-        parser = build(url)
-        for article in parser.articles:
-            a = Article(article.url)
-            try:
-                a.download()
-                a.parse()
-                doc = Document(a.html)
-                print(doc.summary())
-                if a.text != '':
-                    tts = gTTS(a.text)
-                    tts.save(time.today().strftime("%b-%d-%Y-%M-%S-%f")+".mp3")
-            except Exception as e:
-                logging.exception(e)
+        summarizeLinkToAudio(url)
+
+
+def searchWikipedia(argparser: Argparser) -> str:
+    """search wikipedia for a string and return the url"""
+    searchParmas = {
+        "action": "opensearch",
+        "namespace": "0",
+        "search": argparser.args.search,
+        "limit": "10",
+        "format": "json"
+    }
+    res = getWithParams(WIKIPEDIA_SEARCH_URL, searchParmas)
+    print(res)
+    argparser.args.source = res[3][0]
+    summarizeLinkToAudio(argparser)
 
 
 def main() -> None:
     argparser = Argparser()
     if argparser.args.singlelink:
-        singleLinkMode(argparser)
+        summarizeLinkToAudio(argparser)
     elif argparser.args.multilink:
-        multiLinkMode(argparser)
+        summarizeLinksToAudio(argparser)
     elif argparser.args.pdftomp3:
         pdfToVoice(argparser)
+    elif argparser.args.search:
+        searchWikipedia(argparser)
     else:
         pass
author	terminaldweller <thabogre@gmail.com>	2021-10-06 22:30:13 +0000
committer	terminaldweller <thabogre@gmail.com>	2021-10-06 22:30:13 +0000
commit	45f45cd9f76af87fc848551a555f02c49dc7f75b (patch)
tree	04467c054964fe231239b57c883522a058b8f25b /devourer.py
parent	WIP (diff)
download	devourer-45f45cd9f76af87fc848551a555f02c49dc7f75b.tar.gz devourer-45f45cd9f76af87fc848551a555f02c49dc7f75b.zip