From 96b963fc1ab6e6d20581908e19e36cd01f2cf47b Mon Sep 17 00:00:00 2001 From: terminaldweller Date: Fri, 24 Sep 2021 13:00:08 +0330 Subject: WIP --- Dockerfile | 2 +- main.py | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- pyproject.toml | 3 ++ 3 files changed, 106 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index 558380c..7e5641c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,5 +23,5 @@ FROM python-base as production ENV FASTAPI_ENV=production COPY --from=builder-base $VENV_PATH $VENV_PATH COPY ./main.py $PYSETUP_PATH/main.py -ENTRYPOINT $PYSETUP_PATH/main.py +ENTRYPOINT ["$PYSETUP_PATH/main.py"] # CMD ["--source", "https://github.com/coinpride/CryptoList"] diff --git a/main.py b/main.py index ef89ca1..8176293 100755 --- a/main.py +++ b/main.py @@ -3,6 +3,12 @@ import argparse import logging +import subprocess +import sys +import tika +import docker +import os +import nltk from newspaper import Article, build, Config from bs4 import BeautifulSoup from contextlib import closing @@ -17,15 +23,31 @@ from datetime import datetime as time class Argparser(object): def __init__(self): parser = argparse.ArgumentParser() - parser.add_argument( - "--source", - type=str, help="the url where the urls to be extracted reside") - parser.add_argument("--bool", action="store_true", - help="bool", default=False) + parser.add_argument("--source", + type=str, help="the url where the \ + urls to be extracted reside") + parser.add_argument("--out", type=str, + help="the output file", default="") + parser.add_argument("--singlelink", action="store_true", + help="whether the app should work in single-link \ + meaning only one page's contents will be used \ + mode", default=False) + parser.add_argument("--multilink", action="store_true", + help="whether the app should work in multi-link \ + mode meaning the srouce contians a list of links \ + rather than being the actual source itself", + default=False) + parser.add_argument("--sourcetype", type=str, + help="determines the type of the \ + source.html,text,...") + parser.add_argument("--pdftomp3", action="store_true", + default=False, help="convert pdf to mp3. \ + source should be the path to a pdf file and\ + out should be the path to the mp3 output file") self.args = parser.parse_args() -# TODO-maybe actually really do some logging +# FIXME-maybe actually really do some logging def logError(err: RequestException) -> None: """logs the errors""" logging.exception(err) @@ -35,7 +57,7 @@ def isAGoodResponse(resp: Response) -> bool: """checks whether the get we sent got a 200 response""" content_type = resp.headers['Content-Type'].lower() return (resp.status_code == 200 and - content_type is not None and content_type.find("html") > -1) + content_type is not None) def simpleGet(url: str) -> bytes: @@ -81,8 +103,67 @@ def configNews(config: Config) -> None: config.browser_user_agent = "Chrome/91.0.4464.5" -def main() -> None: - argparser = Argparser() +def call_from_shell_list(command_list): + # should probably deprecate this at some point + if sys.version_info < (3, 7): + return subprocess.run(command_list, stdout=subprocess.PIPE) + else: + return subprocess.run(command_list, capture_output=True) + + +def pdfToVoice(argparser: Argparser) -> None: + """main function for converting a pdf to an mp3""" + TIKA_SERVER_ENDPOINT = "127.0.0.1:9977" + os.environ["TIKA_SERVER_ENDPOINT"] = TIKA_SERVER_ENDPOINT + dockerClient = docker.from_env() + container = dockerClient.containers.run("apache/tika:2.0.0", detach=True, + ports={TIKA_SERVER_ENDPOINT: + "9998"}) + while True: + resp = get("http://127.0.0.1:9977") + if resp.status_code == 200: + break + time.sleep(.5) + rawText = tika.parser.from_file() + tts = gTTS(rawText['content']) + tts.save(argparser.args.out) + container.stop() + dockerClient.close() + + +def extractRequirements(textBody: str) -> list: + result = [] + REQ_KEYWORDS = ["shall", "should", "must", "may", "can", "could"] + nltk.download("punkt") + sentences = nltk.sent_tokenize(textBody) + for sentence in sentences: + for keyword in REQ_KEYWORDS: + if sentence.find(keyword) >= 0: + result.append(sentence) + return result + + +def singleLinkMode(argparser: Argparser) -> dict: + """runs the single-link main function""" + if argparser.args.sourcetype == "html": + parser = build(argparser.args.source) + for article in parser.articles: + a = Article(article.url) + try: + a.download() + a.parse() + doc = Document(a.html) + print(doc.summary()) + extractRequirements(doc.summary()) + except Exception as e: + logging.exception(e) + elif argparser.args.sourcetype == "text": + bytesText = simpleGet(argparser.args.source) + extractRequirements(bytesText.decode("utf-8")) + + +def multiLinkMode(argparser: Argparser) -> None: + """run the multi-link main function""" config = Config() configNews(config) urls = getURLS(argparser.args.source) @@ -102,5 +183,17 @@ def main() -> None: logging.exception(e) +def main() -> None: + argparser = Argparser() + if argparser.args.singlelink: + singleLinkMode(argparser) + elif argparser.args.multilink: + multiLinkMode(argparser) + elif argparser.args.pdftomp3: + pdfToVoice(argparser) + else: + pass + + if __name__ == "__main__": main() diff --git a/pyproject.toml b/pyproject.toml index 8eaddf2..1e09611 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,9 @@ newspaper3k = "^0.2.8" beautifulsoup4 = "^4.9.3" readability-lxml = "^0.8.1" gtts = "^2.2.3" +tika = "^1.24" +docker = "^5.0.2" +nltk = "^3.6.3" [tool.poetry.dev-dependencies] -- cgit v1.2.3