Diffstat (limited to 'devourer.py')
-rwxr-xr-x | devourer.py | 148
1 file changed, 95 insertions, 53 deletions
diff --git a/devourer.py b/devourer.py
index 9188d09..94358dd 100755
--- a/devourer.py
+++ b/devourer.py
@@ -9,6 +9,8 @@ import tika
 import docker
 import os
 import nltk
+import random
+import string
 from newspaper import Article, build, Config
 from bs4 import BeautifulSoup
 from contextlib import closing
@@ -26,32 +28,60 @@ WIKIPEDIA_SEARCH_URL = "https://en.wikipedia.org/w/api.php"
 class Argparser(object):
     def __init__(self):
         parser = argparse.ArgumentParser()
-        parser.add_argument("--source",
-                            type=str, help="the url where the \
-                            urls to be extracted reside")
-        parser.add_argument("--out", type=str,
-                            help="the output file", default="")
-        parser.add_argument("--singlelink", action="store_true",
-                            help="whether the app should work in single-link \
+        parser.add_argument(
+            "--source",
+            type=str,
+            help="the url where the \
+            urls to be extracted reside",
+            default="",
+        )
+        parser.add_argument(
+            "--out",
+            type=str,
+            help="the output file name if it applies",
+            default="",
+        )
+        parser.add_argument(
+            "--singlelink",
+            action="store_true",
+            help="whether the app should work in single-link \
             meaning only one page's contents will be used \
-                            mode", default=False)
-        parser.add_argument("--multilink", action="store_true",
-                            help="whether the app should work in multi-link \
+            mode",
+            default=False,
+        )
+        parser.add_argument(
+            "--multilink",
+            action="store_true",
+            help="whether the app should work in multi-link \
             mode meaning the source contains a list of links \
             rather than being the actual source itself",
-                            default=False)
-        parser.add_argument("--sourcetype", type=str,
-                            help="determines the type of the \
-                            source.html,text,...")
-        parser.add_argument("--pdftomp3", action="store_true",
-                            default=False, help="convert pdf to mp3. \
+            default=False,
+        )
+        parser.add_argument(
+            "--sourcetype",
+            type=str,
+            help="determines the type of the \
+            source:html,text,...",
+            default="html",
+        )
+        parser.add_argument(
+            "--pdftomp3",
+            action="store_true",
+            default=False,
+            help="convert pdf to mp3. \
             source should be the path to a pdf file and\
-                            out should be the path to the mp3 output file")
-        parser.add_argument("--summary", type=str, default="newspaper",
-                            help="which summary type to use. currently we \
-                            have newspaper, bart and none.")
-        parser.add_argument("--search", type=str,
-                            default="", help="the search query")
+            out should be the path to the mp3 output file",
+        )
+        parser.add_argument(
+            "--summary",
+            type=str,
+            default="newspaper",
+            help="which summary type to use. currently we \
+            have newspaper, bart and none.",
+        )
+        parser.add_argument(
+            "--search", type=str, default="", help="the string to search for"
+        )
         self.args = parser.parse_args()
 
 
@@ -63,9 +93,8 @@ def logError(err: RequestException) -> None:
 
 def isAGoodResponse(resp: Response) -> bool:
     """Checks whether the get we sent got a 200 response."""
-    content_type = resp.headers['Content-Type'].lower()
-    return (resp.status_code == 200 and
-            content_type is not None)
+    content_type = resp.headers["Content-Type"].lower()
+    return resp.status_code == 200 and content_type is not None
 
 
 def simpleGet(url: str) -> bytes:
@@ -94,23 +123,32 @@ def getWithParams(url: str, params: dict) -> dict:
     return None
 
 
+def getRandStr(n):
+    """Return a random string of the given length."""
+    return "".join([random.choice(string.ascii_lowercase) for i in range(n)])
+
+
 def getURLS(source: str) -> dict:
     """Extracts the urls from a website."""
     result = dict()
     raw_ml = simpleGet(source)
     ml = BeautifulSoup(raw_ml, "lxml")
+
+    rand_tmp = "/tmp/" + getRandStr(20)
     ml_str = repr(ml)
-    tmp = open("/tmp/riecher", "w")
+    tmp = open(rand_tmp, "w")
     tmp.write(ml_str)
     tmp.close()
-    tmp = open("/tmp/riecher", "r")
-    dump_list = []
+    tmp = open(rand_tmp, "r")
+    url_list = []
     for line in tmp:
-        dummy = findall(
-            'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|'
-            r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', line)
-        dump_list += dummy
-    for elem in dump_list:
+        url = findall(
+            "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|"
+            r"(?:%[0-9a-fA-F][0-9a-fA-F]))+",
+            line,
+        )
+        url_list += url
+    for elem in url_list:
         result[elem] = elem
     tmp.close()
     return result
@@ -138,16 +176,16 @@ def pdfToVoice(argparser: Argparser) -> None:
     TIKA_SERVER_ENDPOINT = "127.0.0.1:9977"
     os.environ["TIKA_SERVER_ENDPOINT"] = TIKA_SERVER_ENDPOINT
     dockerClient = docker.from_env()
-    container = dockerClient.containers.run("apache/tika:2.0.0", detach=True,
-                                            ports={TIKA_SERVER_ENDPOINT:
-                                                   "9998"})
+    container = dockerClient.containers.run(
+        "apache/tika:2.0.0", detach=True, ports={TIKA_SERVER_ENDPOINT: "9998"}
+    )
     while True:
         resp = get("http://127.0.0.1:9977")
         if resp.status_code == 200:
             break
-        time.sleep(.5)
+        time.sleep(0.5)
     rawText = tika.parser.from_file()
-    tts = gTTS(rawText['content'])
+    tts = gTTS(rawText["content"])
     tts.save(argparser.args.out)
     container.stop()
     dockerClient.close()
@@ -155,7 +193,7 @@ def pdfToVoice(argparser: Argparser) -> None:
 
 def extractRequirements(textBody: str) -> list:
     """Extract the sentences containing the keywords
-        that denote a requirement."""
+    that denote a requirement."""
     result = []
     REQ_KEYWORDS = ["shall", "should", "must", "may", "can", "could"]
     nltk.download("punkt")
@@ -169,28 +207,32 @@
 
 def summarizeText(text: str) -> str:
     """Summarize the given text using bart."""
-    from transformers import BartTokenizer, BartForConditionalGeneration
-    model = BartForConditionalGeneration.from_pretrained(
-        'facebook/bart-large-cnn')
-    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
-    inputs = tokenizer([text],
-                       max_length=1024, return_tensors='pt')
+    import transformers
+
+    model = transformers.BartForConditionalGeneration.from_pretrained(
+        "facebook/bart-large-cnn"
+    )
+    tokenizer = transformers.BartTokenizer.from_pretrained("facebook/bart-large-cnn")
+    inputs = tokenizer([text], max_length=1024, return_tensors="pt")
     summary_ids = model.generate(
-        inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
-    return([tokenizer.decode(g,
-                             skip_special_tokens=True,
-                             clean_up_tokenization_spaces=False)
-            for g in summary_ids])
+        inputs["input_ids"], num_beams=4, max_length=5, early_stopping=True
+    )
+    return [
+        tokenizer.decode(
+            g, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        for g in summary_ids
+    ]
 
 
 def textToAudio(text: str) -> None:
     """Transform the given text into audio."""
     tts = gTTS(text)
-    tts.save(time.today().strftime("%b-%d-%Y-%M-%S-%f")+".mp3")
+    tts.save(time.today().strftime("%b-%d-%Y-%M-%S-%f") + ".mp3")
 
 
 def singleLinkMode(argparser: Argparser) -> dict:
-    """runs the single-link main function"""
+    """Runs the single-link main function."""
    if argparser.args.sourcetype == "html":
         parser = build(argparser.args.source)
         for article in parser.articles:
@@ -243,7 +285,7 @@ def searchWikipedia(argparser: Argparser) -> str:
         "namespace": "0",
         "search": argparser.args.search,
         "limit": "10",
-        "format": "json"
+        "format": "json",
    }
     res = getWithParams(WIKIPEDIA_SEARCH_URL, searchParmas)
     print(res)
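Review notes

On getRandStr/getURLS: writing the scraped markup to "/tmp/" plus a random suffix avoids the old fixed "/tmp/riecher" name, but the standard library already solves this. A minimal sketch of the same write-then-scan round-trip using tempfile (extract_urls is an illustrative name, not from this commit; URL_RE is the regex from getURLS above):

    import tempfile
    from re import findall

    URL_RE = (
        "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|"
        r"(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )

    def extract_urls(ml_str: str) -> dict:
        """Mirror getURLS: dump the markup to a temp file, scan it for URLs."""
        result = {}
        # tempfile picks a collision-free name and removes the file on close,
        # so no random-suffix helper and no leftover files under /tmp
        with tempfile.NamedTemporaryFile(mode="w+", prefix="devourer-") as tmp:
            tmp.write(ml_str)
            tmp.seek(0)
            for line in tmp:
                for url in findall(URL_RE, line):
                    result[url] = url
        return result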
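On isAGoodResponse: the reformatted version still indexes resp.headers["Content-Type"] directly, so a response without that header raises KeyError before the "is not None" test can ever be False. A sketch of the guard the check seems to intend, using the .get accessor on the requests headers mapping (is_good_response is an illustrative name):

    from requests import Response

    def is_good_response(resp: Response) -> bool:
        """True only for a 200 response that carries a Content-Type header."""
        content_type = resp.headers.get("Content-Type")  # None when absent
        return resp.status_code == 200 and content_type is not None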
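On summarizeText: model.generate keeps max_length=5, which caps the generated summary at roughly five tokens, and the tokenizer call does not truncate inputs longer than BART's 1024-token window. A minimal usage sketch of the same transformers calls with more conventional settings (the min_length/max_length values below are illustrative, not from this commit):

    import transformers

    def summarize(text: str) -> str:
        model = transformers.BartForConditionalGeneration.from_pretrained(
            "facebook/bart-large-cnn"
        )
        tokenizer = transformers.BartTokenizer.from_pretrained(
            "facebook/bart-large-cnn"
        )
        # truncation=True keeps long articles inside the 1024-token encoder window
        inputs = tokenizer(
            [text], max_length=1024, truncation=True, return_tensors="pt"
        )
        summary_ids = model.generate(
            inputs["input_ids"],
            num_beams=4,
            min_length=56,
            max_length=142,
            early_stopping=True,
        )
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)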