diff options
| -rw-r--r-- | Dockerfile | 2 | ||||
| -rwxr-xr-x | main.py | 111 | ||||
| -rw-r--r-- | pyproject.toml | 3 | 
3 files changed, 106 insertions, 10 deletions
@@ -23,5 +23,5 @@ FROM python-base as production  ENV FASTAPI_ENV=production  COPY --from=builder-base $VENV_PATH $VENV_PATH  COPY ./main.py $PYSETUP_PATH/main.py -ENTRYPOINT $PYSETUP_PATH/main.py +ENTRYPOINT ["$PYSETUP_PATH/main.py"]  # CMD ["--source", "https://github.com/coinpride/CryptoList"] @@ -3,6 +3,12 @@  import argparse  import logging +import subprocess +import sys +import tika +import docker +import os +import nltk  from newspaper import Article, build, Config  from bs4 import BeautifulSoup  from contextlib import closing @@ -17,15 +23,31 @@ from datetime import datetime as time  class Argparser(object):      def __init__(self):          parser = argparse.ArgumentParser() -        parser.add_argument( -            "--source", -            type=str, help="the url where the urls to be extracted reside") -        parser.add_argument("--bool", action="store_true", -                            help="bool", default=False) +        parser.add_argument("--source", +                            type=str, help="the url where the \ +                            urls to be extracted reside") +        parser.add_argument("--out", type=str, +                            help="the output file", default="") +        parser.add_argument("--singlelink", action="store_true", +                            help="whether the app should work in single-link \ +                            meaning only one page's contents will be used \ +                            mode", default=False) +        parser.add_argument("--multilink", action="store_true", +                            help="whether the app should work in multi-link \ +                            mode meaning the srouce contians a list of links \ +                            rather than being the actual source itself", +                            default=False) +        parser.add_argument("--sourcetype", type=str, +                            help="determines the type of the \ +                            source.html,text,...") +        parser.add_argument("--pdftomp3", action="store_true", +                            default=False, help="convert pdf to mp3. \ +                            source should be the path to a pdf file and\ +                            out should be the path to the mp3 output file")          self.args = parser.parse_args() -# TODO-maybe actually really do some logging +# FIXME-maybe actually really do some logging  def logError(err: RequestException) -> None:      """logs the errors"""      logging.exception(err) @@ -35,7 +57,7 @@ def isAGoodResponse(resp: Response) -> bool:      """checks whether the get we sent got a 200 response"""      content_type = resp.headers['Content-Type'].lower()      return (resp.status_code == 200 and -            content_type is not None and content_type.find("html") > -1) +            content_type is not None)  def simpleGet(url: str) -> bytes: @@ -81,8 +103,67 @@ def configNews(config: Config) -> None:      config.browser_user_agent = "Chrome/91.0.4464.5" -def main() -> None: -    argparser = Argparser() +def call_from_shell_list(command_list): +    # should probably deprecate this at some point +    if sys.version_info < (3, 7): +        return subprocess.run(command_list, stdout=subprocess.PIPE) +    else: +        return subprocess.run(command_list, capture_output=True) + + +def pdfToVoice(argparser: Argparser) -> None: +    """main function for converting a pdf to an mp3""" +    TIKA_SERVER_ENDPOINT = "127.0.0.1:9977" +    os.environ["TIKA_SERVER_ENDPOINT"] = TIKA_SERVER_ENDPOINT +    dockerClient = docker.from_env() +    container = dockerClient.containers.run("apache/tika:2.0.0", detach=True, +                                            ports={TIKA_SERVER_ENDPOINT: +                                                   "9998"}) +    while True: +        resp = get("http://127.0.0.1:9977") +        if resp.status_code == 200: +            break +        time.sleep(.5) +    rawText = tika.parser.from_file() +    tts = gTTS(rawText['content']) +    tts.save(argparser.args.out) +    container.stop() +    dockerClient.close() + + +def extractRequirements(textBody: str) -> list: +    result = [] +    REQ_KEYWORDS = ["shall", "should", "must", "may", "can", "could"] +    nltk.download("punkt") +    sentences = nltk.sent_tokenize(textBody) +    for sentence in sentences: +        for keyword in REQ_KEYWORDS: +            if sentence.find(keyword) >= 0: +                result.append(sentence) +    return result + + +def singleLinkMode(argparser: Argparser) -> dict: +    """runs the single-link main function""" +    if argparser.args.sourcetype == "html": +        parser = build(argparser.args.source) +        for article in parser.articles: +            a = Article(article.url) +            try: +                a.download() +                a.parse() +                doc = Document(a.html) +                print(doc.summary()) +                extractRequirements(doc.summary()) +            except Exception as e: +                logging.exception(e) +    elif argparser.args.sourcetype == "text": +        bytesText = simpleGet(argparser.args.source) +        extractRequirements(bytesText.decode("utf-8")) + + +def multiLinkMode(argparser: Argparser) -> None: +    """run the multi-link main function"""      config = Config()      configNews(config)      urls = getURLS(argparser.args.source) @@ -102,5 +183,17 @@ def main() -> None:                  logging.exception(e) +def main() -> None: +    argparser = Argparser() +    if argparser.args.singlelink: +        singleLinkMode(argparser) +    elif argparser.args.multilink: +        multiLinkMode(argparser) +    elif argparser.args.pdftomp3: +        pdfToVoice(argparser) +    else: +        pass + +  if __name__ == "__main__":      main() diff --git a/pyproject.toml b/pyproject.toml index 8eaddf2..1e09611 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,9 @@ newspaper3k = "^0.2.8"  beautifulsoup4 = "^4.9.3"  readability-lxml = "^0.8.1"  gtts = "^2.2.3" +tika = "^1.24" +docker = "^5.0.2" +nltk = "^3.6.3"  [tool.poetry.dev-dependencies]  | 
