diff options
author | terminaldweller <thabogre@gmail.com> | 2021-10-06 22:30:13 +0000 |
---|---|---|
committer | terminaldweller <thabogre@gmail.com> | 2021-10-06 22:30:13 +0000 |
commit | 45f45cd9f76af87fc848551a555f02c49dc7f75b (patch) | |
tree | 04467c054964fe231239b57c883522a058b8f25b | |
parent | WIP (diff) | |
download | devourer-45f45cd9f76af87fc848551a555f02c49dc7f75b.tar.gz devourer-45f45cd9f76af87fc848551a555f02c49dc7f75b.zip |
WIP
-rwxr-xr-x | devourer.py (renamed from main.py) | 110 | ||||
-rw-r--r-- | pyproject.toml | 3 |
2 files changed, 93 insertions, 20 deletions
@@ -20,6 +20,9 @@ from gtts import gTTS from datetime import datetime as time +WIKIPEDIA_SEARCH_URL = "https://en.wikipedia.org/w/api.php" + + class Argparser(object): def __init__(self): parser = argparse.ArgumentParser() @@ -44,6 +47,11 @@ class Argparser(object): default=False, help="convert pdf to mp3. \ source should be the path to a pdf file and\ out should be the path to the mp3 output file") + parser.add_argument("--summary", type=str, default="newspaper", + help="which summary type to use. currently we \ + have newspaper, bart and none.") + parser.add_argument("--search", type=str, + default="", help="the search query") self.args = parser.parse_args() @@ -61,7 +69,7 @@ def isAGoodResponse(resp: Response) -> bool: def simpleGet(url: str) -> bytes: - """issues a simple get request to download a website""" + """issues a simple get request""" try: with closing(get(url, stream=True)) as resp: if isAGoodResponse(resp): @@ -73,6 +81,19 @@ def simpleGet(url: str) -> bytes: return None +def getWithParams(url: str, params: dict) -> dict: + """issues a get requesti with params""" + try: + with closing(get(url, params=params, stream=True)) as resp: + if isAGoodResponse(resp): + return resp.json() + else: + return None + except RequestException as e: + logError("Error during requests to {0} : {1}".format(url, str(e))) + return None + + def getURLS(source: str) -> dict: """extracts the urls from a website""" result = dict() @@ -103,8 +124,9 @@ def configNews(config: Config) -> None: config.browser_user_agent = "Chrome/91.0.4464.5" -def call_from_shell_list(command_list): - # should probably deprecate this at some point +def call_from_shell_list(command_list: list): + """run a shell command given a list of command/arguments""" + # TODO-should probably deprecate this at some point if sys.version_info < (3, 7): return subprocess.run(command_list, stdout=subprocess.PIPE) else: @@ -132,6 +154,8 @@ def pdfToVoice(argparser: Argparser) -> None: def extractRequirements(textBody: str) -> list: + """extract the sentences containing the keywords + that denote a requirement""" result = [] REQ_KEYWORDS = ["shall", "should", "must", "may", "can", "could"] nltk.download("punkt") @@ -143,6 +167,28 @@ def extractRequirements(textBody: str) -> list: return result +def summarizeText(text: str) -> str: + """summarize the given text using bart""" + from transformers import BartTokenizer, BartForConditionalGeneration + model = BartForConditionalGeneration.from_pretrained( + 'facebook/bart-large-cnn') + tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') + inputs = tokenizer([text], + max_length=1024, return_tensors='pt') + summary_ids = model.generate( + inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) + return([tokenizer.decode(g, + skip_special_tokens=True, + clean_up_tokenization_spaces=False) + for g in summary_ids]) + + +def textToAudio(text: str) -> None: + """transform the given text into audio""" + tts = gTTS(text) + tts.save(time.today().strftime("%b-%d-%Y-%M-%S-%f")+".mp3") + + def singleLinkMode(argparser: Argparser) -> dict: """runs the single-link main function""" if argparser.args.sourcetype == "html": @@ -162,35 +208,59 @@ def singleLinkMode(argparser: Argparser) -> dict: extractRequirements(bytesText.decode("utf-8")) -def multiLinkMode(argparser: Argparser) -> None: - """run the multi-link main function""" +def summarizeLinkToAudio(argparser: Argparser) -> None: + """summarizes the text inside a given url into audio""" + try: + article = Article(argparser.args.source) + article.download() + article.parse() + if argparser.args.summary == "newspaper": + article.nlp() + textToAudio(article.summary) + elif argparser.args.summary == "none": + textToAudio(article.text) + elif argparser.args.summary == "bart": + textToAudio(summarizeText(article.text)) + else: + print("invalid option for summry type.") + except Exception as e: + logging.exception(e) + + +def summarizeLinksToAudio(argparser: Argparser) -> None: + """summarize a list of urls into audio files""" config = Config() configNews(config) urls = getURLS(argparser.args.source) for url in urls: - parser = build(url) - for article in parser.articles: - a = Article(article.url) - try: - a.download() - a.parse() - doc = Document(a.html) - print(doc.summary()) - if a.text != '': - tts = gTTS(a.text) - tts.save(time.today().strftime("%b-%d-%Y-%M-%S-%f")+".mp3") - except Exception as e: - logging.exception(e) + summarizeLinkToAudio(url) + + +def searchWikipedia(argparser: Argparser) -> str: + """search wikipedia for a string and return the url""" + searchParmas = { + "action": "opensearch", + "namespace": "0", + "search": argparser.args.search, + "limit": "10", + "format": "json" + } + res = getWithParams(WIKIPEDIA_SEARCH_URL, searchParmas) + print(res) + argparser.args.source = res[3][0] + summarizeLinkToAudio(argparser) def main() -> None: argparser = Argparser() if argparser.args.singlelink: - singleLinkMode(argparser) + summarizeLinkToAudio(argparser) elif argparser.args.multilink: - multiLinkMode(argparser) + summarizeLinksToAudio(argparser) elif argparser.args.pdftomp3: pdfToVoice(argparser) + elif argparser.args.search: + searchWikipedia(argparser) else: pass diff --git a/pyproject.toml b/pyproject.toml index 1e09611..ca26b0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,9 @@ gtts = "^2.2.3" tika = "^1.24" docker = "^5.0.2" nltk = "^3.6.3" +tensorflow = "^2.6.0" +torch = "^1.9.1" +transformers = "^4.11.2" [tool.poetry.dev-dependencies] |