WIP

author: terminaldweller <thabogre@gmail.com> 2021-11-11 17:58:22 +0000
committer: terminaldweller <thabogre@gmail.com> 2021-11-11 17:58:22 +0000
commit: 6777413a7f9c8d9ab784e7a39dfb5f60e405cb7b (patch)
tree: 691ae46a355c35230cbba30c6f80e766a0d3158c
parent: update (diff)
download: devourer-6777413a7f9c8d9ab784e7a39dfb5f60e405cb7b.tar.gz
devourer-6777413a7f9c8d9ab784e7a39dfb5f60e405cb7b.zip
6 files changed, 261 insertions, 147 deletions
diff --git a/Dockerfile b/Dockerfile
index 7e5641c..3cf3409 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,9 +19,16 @@ WORKDIR $PYSETUP_PATH
 COPY ./pyproject.toml ./
 RUN poetry install --no-dev
 
+FROM node:lts-alpine3.13 AS certbuilder
+RUN apk add openssl
+WORKDIR /certs
+RUN openssl req -nodes -new -x509 -subj="/C=US/ST=Denial/L=springfield/O=Dis/CN=localhost" -keyout server.key -out server.cert
+
 FROM python-base as production
+RUN pip3 install uvicorn
+COPY --from=certbuilder /certs/ /certs
 ENV FASTAPI_ENV=production
 COPY --from=builder-base $VENV_PATH $VENV_PATH
-COPY ./main.py $PYSETUP_PATH/main.py
-ENTRYPOINT ["$PYSETUP_PATH/main.py"]
-# CMD ["--source", "https://github.com/coinpride/CryptoList"]
+COPY ./docker-entrypoint.sh /docker-entrypoint.sh
+COPY ./devourer.py $PYSETUP_PATH/devourer.py
+WORKDIR $PYSETUP_PATH
diff --git a/devourer.py b/devourer.py
index e869c4c..cc25206 100755..100644
--- a/devourer.py
+++ b/devourer.py
@@ -1,14 +1,11 @@
-#!/usr/bin/env python3
 # _*_ coding=utf-8 _*_
 
-import argparse
 import logging
 import tika
-import docker
-import os
 import nltk
 import random
 import string
+import os
 from newspaper import Article, build, Config
 from bs4 import BeautifulSoup
 from contextlib import closing
@@ -18,69 +15,7 @@ from re import findall
 from readability import Document
 from gtts import gTTS
 from datetime import datetime as time
-
-
-WIKIPEDIA_SEARCH_URL = "https://en.wikipedia.org/w/api.php"
-
-
-class Argparser(object):
-    def __init__(self):
-        parser = argparse.ArgumentParser()
-        parser.add_argument(
-            "--source",
-            type=str,
-            help="the url where the \
-                            urls to be extracted reside",
-            default="",
-        )
-        parser.add_argument(
-            "--out",
-            type=str,
-            help="the output file name if it applies",
-            default="",
-        )
-        parser.add_argument(
-            "--singlelink",
-            action="store_true",
-            help="whether the app should work in single-link \
-                            meaning only one page's contents will be used \
-                            mode",
-            default=False,
-        )
-        parser.add_argument(
-            "--multilink",
-            action="store_true",
-            help="whether the app should work in multi-link \
-                            mode meaning the srouce contians a list of links \
-                            rather than being the actual source itself",
-            default=False,
-        )
-        parser.add_argument(
-            "--sourcetype",
-            type=str,
-            help="determines the type of the \
-                            source:html,text,...",
-            default="html",
-        )
-        parser.add_argument(
-            "--pdftomp3",
-            action="store_true",
-            default=False,
-            help="convert pdf to mp3. \
-                            source should be the path to a pdf file and\
-                            out should be the path to the mp3 output file",
-        )
-        parser.add_argument(
-            "--summary",
-            type=str,
-            default="newspaper",
-            help="which summary type to use. currently we \
-                            have newspaper, bart and none.",
-        )
-        parser.add_argument(
-            "--search", type=str, default="", help="the string to search for"
-        )
-        self.args = parser.parse_args()
+from fastapi import FastAPI
 
 
 # FIXME-maybe actually really do some logging
@@ -109,7 +44,7 @@ def simpleGet(url: str) -> bytes:
 
 
 def getWithParams(url: str, params: dict) -> dict:
-    """Issues a get requesti with params."""
+    """Issues a get request with params."""
     try:
         with closing(get(url, params=params, stream=True)) as resp:
             if isAGoodResponse(resp):
@@ -160,24 +95,19 @@ def configNews(config: Config) -> None:
     config.browser_user_agent = "Chrome/91.0.4464.5"
 
 
-def pdfToVoice(argparser: Argparser) -> None:
+# FIXME-have to decide whether to use files or urls
+def pdfToVoice() -> str:
     """Main function for converting a pdf to an mp3."""
-    TIKA_SERVER_ENDPOINT = "127.0.0.1:9977"
-    os.environ["TIKA_SERVER_ENDPOINT"] = TIKA_SERVER_ENDPOINT
-    dockerClient = docker.from_env()
-    container = dockerClient.containers.run(
-        "apache/tika:2.0.0", detach=True, ports={TIKA_SERVER_ENDPOINT: "9998"}
-    )
-    while True:
-        resp = get("http://127.0.0.1:9977")
-        if resp.status_code == 200:
-            break
-        time.sleep(0.5)
-    rawText = tika.parser.from_file()
-    tts = gTTS(rawText["content"])
-    tts.save(argparser.args.out)
-    container.stop()
-    dockerClient.close()
+    outfile = str()
+    try:
+        rawText = tika.parser.from_file()
+        tts = gTTS(rawText["content"])
+        outfile = getRandStr(20) + ".mp3"
+        tts.save(outfile)
+    except Exception as e:
+        logging.exception(e)
+    finally:
+        return outfile
 
 
 def extractRequirements(textBody: str) -> list:
@@ -228,90 +158,204 @@ def summarizeText(text: str) -> str:
     ]
 
 
-def textToAudio(text: str) -> None:
+def textToAudio(text: str) -> str:
     """Transform the given text into audio."""
-    tts = gTTS(text)
-    tts.save(time.today().strftime("%b-%d-%Y-%M-%S-%f") + ".mp3")
+    try:
+        path = str()
+        path = (
+            os.environ["AUDIO_DUMP_DIR"]
+            + time.today().strftime("%b-%d-%Y-%M-%S-%f")
+            + ".mp3"
+        )
+        tts = gTTS(text)
+        tts.save(path)
+    except Exception as e:
+        logging.exception(e)
+    finally:
+        return path
 
 
-def singleLinkMode(argparser: Argparser) -> dict:
+def getRequirements(url: str, sourcetype: str) -> list:
     """Runs the single-link main function."""
-    if argparser.args.sourcetype == "html":
-        parser = build(argparser.args.source)
-        for article in parser.articles:
-            a = Article(article.url)
-            try:
+    result = str()
+    results = list()
+    try:
+        if sourcetype == "html":
+            parser = build(url)
+            for article in parser.articles:
+                a = Article(article.url)
                 a.download()
                 a.parse()
                 doc = Document(a.html)
-                print(doc.summary())
-                extractRequirements(doc.summary())
-            except Exception as e:
-                logging.exception(e)
-    elif argparser.args.sourcetype == "text":
-        bytesText = simpleGet(argparser.args.source)
-        extractRequirements(bytesText.decode("utf-8"))
+                # print(doc.summary())
+                results = extractRequirements(doc.summary())
+        elif sourcetype == "text":
+            bytesText = simpleGet(url)
+            results = extractRequirements(bytesText.decode("utf-8"))
+    except Exception as e:
+        logging.exception(e)
+    finally:
+        result = "".join(results + "\n")
+        return result
 
 
-def summarizeLinkToAudio(argparser: Argparser) -> None:
+def summarizeLinkToAudio(url, summary) -> str:
     """Summarizes the text inside a given url into audio."""
+    result = str()
     try:
-        article = Article(argparser.args.source)
+        article = Article(url)
         article.download()
         article.parse()
-        if argparser.args.summary == "newspaper":
+        if summary == "newspaper":
             article.nlp()
-            textToAudio(article.summary)
-        elif argparser.args.summary == "none":
-            textToAudio(article.text)
-        elif argparser.args.summary == "bart":
-            textToAudio(summarizeText(article.text))
+            result = article.summary
+        elif summary == "none":
+            result = article.text
+        elif summary == "bart":
+            result = article.text
         else:
-            print("invalid option for summry type.")
+            print("invalid option for summary type.")
+            result = None
     except Exception as e:
+        result = None
         logging.exception(e)
+    finally:
+        return result
 
 
-def summarizeLinksToAudio(argparser: Argparser) -> None:
+def summarizeLinksToAudio(url, summary) -> None:
     """Summarize a list of urls into audio files."""
-    config = Config()
-    configNews(config)
-    urls = getURLS(argparser.args.source)
-    for url in urls:
-        summarizeLinkToAudio(url)
+    results = list()
+    result = str()
+    try:
+        config = Config()
+        configNews(config)
+        urls = getURLS(url, summary)
+        for url in urls:
+            results.append(summarizeLinkToAudio(url))
+    except Exception as e:
+        logging.exception(e)
+    finally:
+        result = "".join(results)
+        return result
 
 
-def searchWikipedia(argparser: Argparser) -> str:
+def searchWikipedia(search_term: str) -> str:
     """Search wikipedia for a string and return the url.
 
     reference: https://www.mediawiki.org/wiki/API:Opensearch
     """
-    searchParmas = {
-        "action": "opensearch",
-        "namespace": "0",
-        "search": argparser.args.search,
-        "limit": "10",
-        "format": "json",
+    result = str()
+    try:
+        searchParmas = {
+            "action": "opensearch",
+            "namespace": "0",
+            "search": search_term,
+            "limit": "10",
+            "format": "json",
+        }
+        res = getWithParams(os.environ["WIKI_SEARCH_URL"], searchParmas)
+        # FIXME-handle wiki redirects/disambiguations
+        # argparser.args.source = res[3][0]
+        print(res)
+    except Exception as e:
+        logging.exception(e)
+    finally:
+        return result
+
+
+def getAudioFromFile(audio_path: str) -> str:
+    """Returns the contents of a file in binary format"""
+    with open(audio_path, "rb") as audio:
+        return audio.read()
+
+
+app = FastAPI()
+
+
+@app.get("/tika")
+async def pdf_to_audio_ep(url: str):
+    """turns a pdf into an audiofile"""
+    audio_path = pdfToVoice()
+    return {
+        "Content-Type": "application/json",
+        "isOK": True if audio_path != "" else False,
+        "audio": getAudioFromFile(audio_path) if audio_path != "" else "",
     }
-    res = getWithParams(WIKIPEDIA_SEARCH_URL, searchParmas)
-    print(res)
-    argparser.args.source = res[3][0]
-    summarizeLinkToAudio(argparser)
-
-
-def main() -> None:
-    argparser = Argparser()
-    if argparser.args.singlelink:
-        summarizeLinkToAudio(argparser)
-    elif argparser.args.multilink:
-        summarizeLinksToAudio(argparser)
-    elif argparser.args.pdftomp3:
-        pdfToVoice(argparser)
-    elif argparser.args.search:
-        searchWikipedia(argparser)
-    else:
-        pass
 
 
-if __name__ == "__main__":
-    main()
+@app.get("/reqs")
+async def extract_reqs_ep(url: str, sourcetype: str = "html"):
+    """extracts the requirements from a given url"""
+    result = getRequirements()
+    return {
+        "Content-Type": "application/json",
+        "isOK": True if result != "" else False,
+        "reqs": result,
+    }
+
+
+@app.get("/wiki")
+async def wiki_search_ep(term: str, audio: bool = False):
+    """search and summarizes from wikipedia"""
+    text = searchWikipedia(term)
+    if audio:
+        audio_path = textToAudio(text)
+        return {
+            "Content-Type": "application/json",
+            "isOK": (True if audio_path != "" else False)
+            and (True if text != "" else False),
+            "audio": getAudioFromFile(audio_path) if audio_path != "" else "",
+            "text": text,
+        }
+    else:
+        return {
+            "Content-Type": "application/json",
+            "isOK": True if text != "" else False,
+            "audio": "",
+            "text": text,
+        }
+
+
+@app.get("/summ")
+async def summarize_ep(url: str, summary: str = "none", audio: bool = False):
+    """summarize and turn the summary into audio"""
+    text = summarizeLinkToAudio(url, summary)
+    if audio:
+        audio_path = textToAudio(text)
+        return {
+            "Content-Type": "application/json",
+            "isOK": (True if audio_path != "" else False)
+            and (True if text != "" else False),
+            "audio": getAudioFromFile(audio_path) if audio_path != "" else "",
+            "text": text,
+        }
+    else:
+        return {
+            "Content-Type": "application/json",
+            "isOK": True if text != "" else False,
+            "audio": "",
+            "text": text,
+        }
+
+
+@app.get("/mila")
+async def mila_ep(url: str, summary: str = "newspaper", audio: bool = False):
+    """extract all the urls and then summarize and turn into audio"""
+    text = summarizeLinksToAudio(url, summary)
+    if audio:
+        audio_path = textToAudio(text)
+        return {
+            "Content-Type": "application/json",
+            "isOK": (True if audio_path != "" else False)
+            and (True if text != "" else False),
+            "audio": getAudioFromFile(audio_path) if audio_path != "" else "",
+            "text": text,
+        }
+    else:
+        return {
+            "Content-Type": "application/json",
+            "isOK": True if text != "" else False,
+            "audio": "",
+            "text": text,
+        }
diff --git a/docker-compose-test.yaml b/docker-compose-test.yaml
new file mode 100644
index 0000000..fea638b
--- /dev/null
+++ b/docker-compose-test.yaml
@@ -0,0 +1,28 @@
+version: "3.7"
+services:
+  devourer:
+    image: devourer
+    networks:
+      - mainnet
+      - tikanet
+    depends_on:
+      - tika
+    ports:
+      - "9009:80"
+    environment:
+      - TIKA_SERVER_ENDPOINT="tika:9998"
+      - AUDIO_DUMP_DIR="/tmp"
+      - WIKI_SEARCH_URL="https://en.wikipedia.org/w/api.php"
+      - SERVER_DEPLOYMENT_TYPE=test
+    cap_drop:
+      - ALL
+    entrypoint: ["/docker-entrypoint.sh"]
+  tika:
+    image: apache/tike:2.0.0
+    networks:
+      - tikanet
+    cap_drop:
+      - ALL
+networks:
+  mainnet:
+  tikanet:
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..5a2bccf
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,28 @@
+version: "3.7"
+services:
+  devourer:
+    image: devourer
+    networks:
+      - mainnet
+      - tikanet
+    depends_on:
+      - tika
+    ports:
+      - "9009:80"
+    environment:
+      - TIKA_SERVER_ENDPOINT="tika:9998"
+      - AUDIO_DUMP_DIR="/tmp"
+      - WIKI_SEARCH_URL="https://en.wikipedia.org/w/api.php"
+      - SERVER_DEPLOYMENT_TYPE=deployment
+    cap_drop:
+      - ALL
+    entrypoint: ["/docker-entrypoint.sh"]
+  tika:
+    image: apache/tike:2.0.0
+    networks:
+      - tikanet
+    cap_drop:
+      - ALL
+networks:
+  mainnet:
+  tikanet:
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
new file mode 100755
index 0000000..58d63ce
--- /dev/null
+++ b/docker-entrypoint.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env sh
+
+if [ "$SERVER_DEPLOYMENT_TYPE" = "deployment" ]; then
+  uvicorn main:app --host 0.0.0.0 --port 80 --ssl-certfile /certs/server.cert --ssl-keyfile /certs/server.key
+elif [ "$SERVER_DEPLOYMENT_TYPE" = "test" ]; then
+  uvicorn main:app --host 0.0.0.0 --port 80 --ssl-certfile /certs/server.cert --ssl-keyfile /certs/server.key
+fi
diff --git a/pyproject.toml b/pyproject.toml
index ca26b0f..1c677c6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,11 +12,11 @@ beautifulsoup4 = "^4.9.3"
 readability-lxml = "^0.8.1"
 gtts = "^2.2.3"
 tika = "^1.24"
-docker = "^5.0.2"
 nltk = "^3.6.3"
 tensorflow = "^2.6.0"
 torch = "^1.9.1"
 transformers = "^4.11.2"
+fastapi = "^0.70.0"
 
 [tool.poetry.dev-dependencies]
author	terminaldweller <thabogre@gmail.com>	2021-11-11 17:58:22 +0000
committer	terminaldweller <thabogre@gmail.com>	2021-11-11 17:58:22 +0000
commit	6777413a7f9c8d9ab784e7a39dfb5f60e405cb7b (patch)
tree	691ae46a355c35230cbba30c6f80e766a0d3158c
parent	update (diff)
download	devourer-6777413a7f9c8d9ab784e7a39dfb5f60e405cb7b.tar.gz devourer-6777413a7f9c8d9ab784e7a39dfb5f60e405cb7b.zip