From 45f45cd9f76af87fc848551a555f02c49dc7f75b Mon Sep 17 00:00:00 2001
From: terminaldweller
Date: Thu, 7 Oct 2021 02:00:13 +0330
Subject: WIP

---
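A quick note on what this WIP does: devourer.py supersedes main.py,
carrying over the single-link and multi-link modes and adding a
bart-based summarizer (hence the tensorflow/torch/transformers
additions to pyproject.toml), a wikipedia search mode, and a --summary
switch. A few hypothetical invocations, assuming the poetry
dependencies are installed (the flags mirror the argparse definitions
in the diff below):

    ./devourer.py --singlelink --source <url> --summary bart
    ./devourer.py --multilink --source <url-of-a-page-of-links>
    ./devourer.py --search "requirements engineering"
    ./devourer.py --pdftomp3 --source ./spec.pdf --out ./spec.mp3
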
 devourer.py    | 269 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 main.py        | 199 ------------------------------------------
 pyproject.toml |   3 +
 3 files changed, 272 insertions(+), 199 deletions(-)
 create mode 100755 devourer.py
 delete mode 100755 main.py

diff --git a/devourer.py b/devourer.py
new file mode 100755
index 0000000..863a8b0
--- /dev/null
+++ b/devourer.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+# _*_ coding=utf-8 _*_
+
+import argparse
+import logging
+import subprocess
+import sys
+import time
+import docker
+import os
+import nltk
+from tika import parser as tika_parser
+from newspaper import Article, build, Config
+from bs4 import BeautifulSoup
+from contextlib import closing
+from requests import get, Response
+from requests.exceptions import RequestException
+from re import findall
+from readability import Document
+from gtts import gTTS
+from datetime import datetime
+
+
+WIKIPEDIA_SEARCH_URL = "https://en.wikipedia.org/w/api.php"
+
+
+class Argparser(object):
+    def __init__(self):
+        parser = argparse.ArgumentParser()
+        parser.add_argument("--source", type=str,
+                            help="the url where the urls to be "
+                                 "extracted reside")
+        parser.add_argument("--out", type=str,
+                            help="the output file", default="")
+        parser.add_argument("--singlelink", action="store_true",
+                            help="whether the app should work in "
+                                 "single-link mode, meaning only one "
+                                 "page's contents will be used",
+                            default=False)
+        parser.add_argument("--multilink", action="store_true",
+                            help="whether the app should work in "
+                                 "multi-link mode, meaning the source "
+                                 "contains a list of links rather than "
+                                 "being the actual source itself",
+                            default=False)
+        parser.add_argument("--sourcetype", type=str,
+                            help="determines the type of the source: "
+                                 "html, text, ...")
+        parser.add_argument("--pdftomp3", action="store_true",
+                            default=False,
+                            help="convert pdf to mp3. source should be "
+                                 "the path to a pdf file and out should "
+                                 "be the path to the mp3 output file")
+        parser.add_argument("--summary", type=str, default="newspaper",
+                            help="which summary type to use. currently "
+                                 "we have newspaper, bart and none.")
+        parser.add_argument("--search", type=str, default="",
+                            help="the search query")
+        self.args = parser.parse_args()
+
+
+# FIXME-maybe actually really do some logging
+def logError(err: str) -> None:
+    """logs the errors"""
+    logging.exception(err)
+
+
+def isAGoodResponse(resp: Response) -> bool:
+    """checks whether the get we sent got a 200 response"""
+    content_type = resp.headers.get("Content-Type", "").lower()
+    return resp.status_code == 200 and content_type != ""
+
+
+def simpleGet(url: str) -> bytes:
+    """issues a simple get request"""
+    try:
+        with closing(get(url, stream=True)) as resp:
+            if isAGoodResponse(resp):
+                return resp.content
+            else:
+                return None
+    except RequestException as e:
+        logError("Error during requests to {0} : {1}".format(url, str(e)))
+        return None
+
+
+def getWithParams(url: str, params: dict) -> dict:
+    """issues a get request with params"""
+    try:
+        with closing(get(url, params=params, stream=True)) as resp:
+            if isAGoodResponse(resp):
+                return resp.json()
+            else:
+                return None
+    except RequestException as e:
+        logError("Error during requests to {0} : {1}".format(url, str(e)))
+        return None
+
+
+def getURLS(source: str) -> dict:
+    """extracts the urls from a website"""
+    result = dict()
+    raw_ml = simpleGet(source)
+    ml = BeautifulSoup(raw_ml, "lxml")
+    with open("/tmp/riecher", "w") as tmp:
+        tmp.write(repr(ml))
+    dump_list = []
+    with open("/tmp/riecher") as tmp:
+        for line in tmp:
+            dummy = findall(
+                'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|'
+                r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)
+            dump_list += dummy
+    for elem in dump_list:
+        result[elem] = elem
+    return result
+
+
+def configNews(config: Config) -> None:
+    """configures newspaper"""
+    config.fetch_images = False
+    config.keep_article_html = True
+    config.memoize_articles = False
+    config.browser_user_agent = "Chrome/91.0.4464.5"
+
+
+def call_from_shell_list(command_list: list):
+    """run a shell command given a list of command/arguments"""
+    # TODO-should probably deprecate this at some point
+    if sys.version_info < (3, 7):
+        return subprocess.run(command_list, stdout=subprocess.PIPE)
+    else:
+        return subprocess.run(command_list, capture_output=True)
+
+
+def pdfToVoice(argparser: Argparser) -> None:
+    """main function for converting a pdf to an mp3"""
+    TIKA_SERVER_ENDPOINT = "127.0.0.1:9977"
+    os.environ["TIKA_SERVER_ENDPOINT"] = TIKA_SERVER_ENDPOINT
+    dockerClient = docker.from_env()
+    # expose the container's tika port (9998) as 9977 on the host
+    container = dockerClient.containers.run("apache/tika:2.0.0", detach=True,
+                                            ports={"9998/tcp": 9977})
+    # poll until the tika server inside the container is up
+    while True:
+        try:
+            resp = get("http://127.0.0.1:9977")
+            if resp.status_code == 200:
+                break
+        except RequestException:
+            pass
+        time.sleep(.5)
+    rawText = tika_parser.from_file(argparser.args.source)
+    tts = gTTS(rawText["content"])
+    tts.save(argparser.args.out)
+    container.stop()
+    dockerClient.close()
+
+
+def extractRequirements(textBody: str) -> list:
+    """extract the sentences containing the keywords
+    that denote a requirement"""
+    result = []
+    REQ_KEYWORDS = ["shall", "should", "must", "may", "can", "could"]
+    nltk.download("punkt")
+    sentences = nltk.sent_tokenize(textBody)
+    for sentence in sentences:
+        for keyword in REQ_KEYWORDS:
+            if keyword in sentence:
+                result.append(sentence)
+    return result
+
+
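+# summarizeText below is a plain encode/generate/decode pass over the
+# pretrained facebook/bart-large-cnn checkpoint: the tokenizer truncates
+# the article to bart's 1024-token window, generate() beam-searches a
+# summary, and decode() turns the best beam back into a string. Loading
+# the model on every call is slow; caching it at module scope would be
+# a reasonable follow-up.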
+def summarizeText(text: str) -> str:
+    """summarize the given text using bart"""
+    from transformers import BartTokenizer, BartForConditionalGeneration
+    model = BartForConditionalGeneration.from_pretrained(
+        'facebook/bart-large-cnn')
+    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+    inputs = tokenizer([text], max_length=1024, truncation=True,
+                       return_tensors='pt')
+    summary_ids = model.generate(inputs['input_ids'], num_beams=4,
+                                 max_length=142, early_stopping=True)
+    return tokenizer.decode(summary_ids[0], skip_special_tokens=True,
+                            clean_up_tokenization_spaces=False)
+
+
+def textToAudio(text: str) -> None:
+    """transform the given text into audio"""
+    tts = gTTS(text)
+    tts.save(datetime.today().strftime("%b-%d-%Y-%M-%S-%f") + ".mp3")
+
+
+def singleLinkMode(argparser: Argparser) -> None:
+    """runs the single-link main function"""
+    if argparser.args.sourcetype == "html":
+        parser = build(argparser.args.source)
+        for article in parser.articles:
+            a = Article(article.url)
+            try:
+                a.download()
+                a.parse()
+                doc = Document(a.html)
+                print(doc.summary())
+                extractRequirements(doc.summary())
+            except Exception as e:
+                logging.exception(e)
+    elif argparser.args.sourcetype == "text":
+        bytesText = simpleGet(argparser.args.source)
+        extractRequirements(bytesText.decode("utf-8"))
+
+
+def summarizeLinkToAudio(argparser: Argparser) -> None:
+    """summarizes the text inside a given url into audio"""
+    try:
+        article = Article(argparser.args.source)
+        article.download()
+        article.parse()
+        if argparser.args.summary == "newspaper":
+            article.nlp()
+            textToAudio(article.summary)
+        elif argparser.args.summary == "none":
+            textToAudio(article.text)
+        elif argparser.args.summary == "bart":
+            textToAudio(summarizeText(article.text))
+        else:
+            print("invalid option for summary type.")
+    except Exception as e:
+        logging.exception(e)
+
+
+def summarizeLinksToAudio(argparser: Argparser) -> None:
+    """summarize a list of urls into audio files"""
+    config = Config()
+    configNews(config)
+    urls = getURLS(argparser.args.source)
+    for url in urls:
+        argparser.args.source = url
+        summarizeLinkToAudio(argparser)
+
+
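+# mediawiki's opensearch endpoint answers with a four-element json
+# array: [query, [titles], [descriptions], [urls]]; res[3][0] below is
+# therefore the url of the first hit.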
parser.add_argument("--singlelink", action="store_true", - help="whether the app should work in single-link \ - meaning only one page's contents will be used \ - mode", default=False) - parser.add_argument("--multilink", action="store_true", - help="whether the app should work in multi-link \ - mode meaning the srouce contians a list of links \ - rather than being the actual source itself", - default=False) - parser.add_argument("--sourcetype", type=str, - help="determines the type of the \ - source.html,text,...") - parser.add_argument("--pdftomp3", action="store_true", - default=False, help="convert pdf to mp3. \ - source should be the path to a pdf file and\ - out should be the path to the mp3 output file") - self.args = parser.parse_args() - - -# FIXME-maybe actually really do some logging -def logError(err: RequestException) -> None: - """logs the errors""" - logging.exception(err) - - -def isAGoodResponse(resp: Response) -> bool: - """checks whether the get we sent got a 200 response""" - content_type = resp.headers['Content-Type'].lower() - return (resp.status_code == 200 and - content_type is not None) - - -def simpleGet(url: str) -> bytes: - """issues a simple get request to download a website""" - try: - with closing(get(url, stream=True)) as resp: - if isAGoodResponse(resp): - return resp.content - else: - return None - except RequestException as e: - logError("Error during requests to {0} : {1}".format(url, str(e))) - return None - - -def getURLS(source: str) -> dict: - """extracts the urls from a website""" - result = dict() - raw_ml = simpleGet(source) - ml = BeautifulSoup(raw_ml, "lxml") - ml_str = repr(ml) - tmp = open("/tmp/riecher", "w") - tmp.write(ml_str) - tmp.close() - tmp = open("/tmp/riecher", "r") - dump_list = [] - for line in tmp: - dummy = findall( - 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|' - r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', line) - dump_list += dummy - for elem in dump_list: - result[elem] = elem - tmp.close() - return result - - -def configNews(config: Config) -> None: - """configures newspaper""" - config.fetch_images = False - config.keep_article_html = True - config.memoize_articles = False - config.browser_user_agent = "Chrome/91.0.4464.5" - - -def call_from_shell_list(command_list): - # should probably deprecate this at some point - if sys.version_info < (3, 7): - return subprocess.run(command_list, stdout=subprocess.PIPE) - else: - return subprocess.run(command_list, capture_output=True) - - -def pdfToVoice(argparser: Argparser) -> None: - """main function for converting a pdf to an mp3""" - TIKA_SERVER_ENDPOINT = "127.0.0.1:9977" - os.environ["TIKA_SERVER_ENDPOINT"] = TIKA_SERVER_ENDPOINT - dockerClient = docker.from_env() - container = dockerClient.containers.run("apache/tika:2.0.0", detach=True, - ports={TIKA_SERVER_ENDPOINT: - "9998"}) - while True: - resp = get("http://127.0.0.1:9977") - if resp.status_code == 200: - break - time.sleep(.5) - rawText = tika.parser.from_file() - tts = gTTS(rawText['content']) - tts.save(argparser.args.out) - container.stop() - dockerClient.close() - - -def extractRequirements(textBody: str) -> list: - result = [] - REQ_KEYWORDS = ["shall", "should", "must", "may", "can", "could"] - nltk.download("punkt") - sentences = nltk.sent_tokenize(textBody) - for sentence in sentences: - for keyword in REQ_KEYWORDS: - if sentence.find(keyword) >= 0: - result.append(sentence) - return result - - -def singleLinkMode(argparser: Argparser) -> dict: - """runs the single-link main function""" - if argparser.args.sourcetype 
== "html": - parser = build(argparser.args.source) - for article in parser.articles: - a = Article(article.url) - try: - a.download() - a.parse() - doc = Document(a.html) - print(doc.summary()) - extractRequirements(doc.summary()) - except Exception as e: - logging.exception(e) - elif argparser.args.sourcetype == "text": - bytesText = simpleGet(argparser.args.source) - extractRequirements(bytesText.decode("utf-8")) - - -def multiLinkMode(argparser: Argparser) -> None: - """run the multi-link main function""" - config = Config() - configNews(config) - urls = getURLS(argparser.args.source) - for url in urls: - parser = build(url) - for article in parser.articles: - a = Article(article.url) - try: - a.download() - a.parse() - doc = Document(a.html) - print(doc.summary()) - if a.text != '': - tts = gTTS(a.text) - tts.save(time.today().strftime("%b-%d-%Y-%M-%S-%f")+".mp3") - except Exception as e: - logging.exception(e) - - -def main() -> None: - argparser = Argparser() - if argparser.args.singlelink: - singleLinkMode(argparser) - elif argparser.args.multilink: - multiLinkMode(argparser) - elif argparser.args.pdftomp3: - pdfToVoice(argparser) - else: - pass - - -if __name__ == "__main__": - main() diff --git a/pyproject.toml b/pyproject.toml index 1e09611..ca26b0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,9 @@ gtts = "^2.2.3" tika = "^1.24" docker = "^5.0.2" nltk = "^3.6.3" +tensorflow = "^2.6.0" +torch = "^1.9.1" +transformers = "^4.11.2" [tool.poetry.dev-dependencies] -- cgit v1.2.3