aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorterminaldweller <thabogre@gmail.com>2021-11-11 17:58:22 +0000
committerterminaldweller <thabogre@gmail.com>2021-11-11 17:58:22 +0000
commit6777413a7f9c8d9ab784e7a39dfb5f60e405cb7b (patch)
tree691ae46a355c35230cbba30c6f80e766a0d3158c
parentupdate (diff)
downloaddevourer-6777413a7f9c8d9ab784e7a39dfb5f60e405cb7b.tar.gz
devourer-6777413a7f9c8d9ab784e7a39dfb5f60e405cb7b.zip
WIP
-rw-r--r--Dockerfile13
-rw-r--r--[-rwxr-xr-x]devourer.py330
-rw-r--r--docker-compose-test.yaml28
-rw-r--r--docker-compose.yaml28
-rwxr-xr-xdocker-entrypoint.sh7
-rw-r--r--pyproject.toml2
6 files changed, 261 insertions, 147 deletions
diff --git a/Dockerfile b/Dockerfile
index 7e5641c..3cf3409 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,9 +19,16 @@ WORKDIR $PYSETUP_PATH
COPY ./pyproject.toml ./
RUN poetry install --no-dev
+FROM node:lts-alpine3.13 AS certbuilder
+RUN apk add openssl
+WORKDIR /certs
+RUN openssl req -nodes -new -x509 -subj="/C=US/ST=Denial/L=springfield/O=Dis/CN=localhost" -keyout server.key -out server.cert
+
FROM python-base as production
+RUN pip3 install uvicorn
+COPY --from=certbuilder /certs/ /certs
ENV FASTAPI_ENV=production
COPY --from=builder-base $VENV_PATH $VENV_PATH
-COPY ./main.py $PYSETUP_PATH/main.py
-ENTRYPOINT ["$PYSETUP_PATH/main.py"]
-# CMD ["--source", "https://github.com/coinpride/CryptoList"]
+COPY ./docker-entrypoint.sh /docker-entrypoint.sh
+COPY ./devourer.py $PYSETUP_PATH/devourer.py
+WORKDIR $PYSETUP_PATH
diff --git a/devourer.py b/devourer.py
index e869c4c..cc25206 100755..100644
--- a/devourer.py
+++ b/devourer.py
@@ -1,14 +1,11 @@
-#!/usr/bin/env python3
# _*_ coding=utf-8 _*_
-import argparse
import logging
import tika
-import docker
-import os
import nltk
import random
import string
+import os
from newspaper import Article, build, Config
from bs4 import BeautifulSoup
from contextlib import closing
@@ -18,69 +15,7 @@ from re import findall
from readability import Document
from gtts import gTTS
from datetime import datetime as time
-
-
-WIKIPEDIA_SEARCH_URL = "https://en.wikipedia.org/w/api.php"
-
-
-class Argparser(object):
- def __init__(self):
- parser = argparse.ArgumentParser()
- parser.add_argument(
- "--source",
- type=str,
- help="the url where the \
- urls to be extracted reside",
- default="",
- )
- parser.add_argument(
- "--out",
- type=str,
- help="the output file name if it applies",
- default="",
- )
- parser.add_argument(
- "--singlelink",
- action="store_true",
- help="whether the app should work in single-link \
- meaning only one page's contents will be used \
- mode",
- default=False,
- )
- parser.add_argument(
- "--multilink",
- action="store_true",
- help="whether the app should work in multi-link \
- mode meaning the srouce contians a list of links \
- rather than being the actual source itself",
- default=False,
- )
- parser.add_argument(
- "--sourcetype",
- type=str,
- help="determines the type of the \
- source:html,text,...",
- default="html",
- )
- parser.add_argument(
- "--pdftomp3",
- action="store_true",
- default=False,
- help="convert pdf to mp3. \
- source should be the path to a pdf file and\
- out should be the path to the mp3 output file",
- )
- parser.add_argument(
- "--summary",
- type=str,
- default="newspaper",
- help="which summary type to use. currently we \
- have newspaper, bart and none.",
- )
- parser.add_argument(
- "--search", type=str, default="", help="the string to search for"
- )
- self.args = parser.parse_args()
+from fastapi import FastAPI
# FIXME-maybe actually really do some logging
@@ -109,7 +44,7 @@ def simpleGet(url: str) -> bytes:
def getWithParams(url: str, params: dict) -> dict:
- """Issues a get requesti with params."""
+ """Issues a get request with params."""
try:
with closing(get(url, params=params, stream=True)) as resp:
if isAGoodResponse(resp):
@@ -160,24 +95,19 @@ def configNews(config: Config) -> None:
config.browser_user_agent = "Chrome/91.0.4464.5"
-def pdfToVoice(argparser: Argparser) -> None:
+# FIXME-have to decide whether to use files or urls
+def pdfToVoice(source: str = "") -> str:
     """Main function for converting a pdf to an mp3."""
-    TIKA_SERVER_ENDPOINT = "127.0.0.1:9977"
-    os.environ["TIKA_SERVER_ENDPOINT"] = TIKA_SERVER_ENDPOINT
-    dockerClient = docker.from_env()
-    container = dockerClient.containers.run(
-        "apache/tika:2.0.0", detach=True, ports={TIKA_SERVER_ENDPOINT: "9998"}
-    )
-    while True:
-        resp = get("http://127.0.0.1:9977")
-        if resp.status_code == 200:
-            break
-        time.sleep(0.5)
-    rawText = tika.parser.from_file()
-    tts = gTTS(rawText["content"])
-    tts.save(argparser.args.out)
-    container.stop()
-    dockerClient.close()
+    # source: location of the pdf. Returns the mp3 name, or "" on any failure.
+    outfile = str()
+    try:
+        rawText = tika.parser.from_file(source)  # the call needs the document
+        outfile = getRandStr(20) + ".mp3"
+        gTTS(rawText["content"]).save(outfile)
+    except Exception as e:
+        logging.exception(e)
+        outfile = str()  # do not report a file that was never written
+    return outfile
def extractRequirements(textBody: str) -> list:
@@ -228,90 +158,204 @@ def summarizeText(text: str) -> str:
]
-def textToAudio(text: str) -> None:
+def textToAudio(text: str) -> str:
     """Transform the given text into audio."""
-    tts = gTTS(text)
-    tts.save(time.today().strftime("%b-%d-%Y-%M-%S-%f") + ".mp3")
+    # Returns the path of the written mp3, or "" when anything went wrong.
+    path = str()
+    try:
+        # join() supplies the separator that plain "+" concatenation left out
+        path = os.path.join(
+            os.environ["AUDIO_DUMP_DIR"],
+            time.today().strftime("%b-%d-%Y-%M-%S-%f") + ".mp3",
+        )
+        gTTS(text).save(path)
+    except Exception as e:
+        logging.exception(e)
+        path = str()  # the file was not written, so do not advertise it
+    return path
-def singleLinkMode(argparser: Argparser) -> dict:
+def getRequirements(url: str, sourcetype: str) -> str:
     """Runs the single-link main function."""
-    if argparser.args.sourcetype == "html":
-        parser = build(argparser.args.source)
-        for article in parser.articles:
-            a = Article(article.url)
-            try:
+    result = str()
+    results = list()
+    try:
+        if sourcetype == "html":
+            parser = build(url)
+            for article in parser.articles:
+                a = Article(article.url)
                 a.download()
                 a.parse()
                 doc = Document(a.html)
-                print(doc.summary())
-                extractRequirements(doc.summary())
-            except Exception as e:
-                logging.exception(e)
-    elif argparser.args.sourcetype == "text":
-        bytesText = simpleGet(argparser.args.source)
-        extractRequirements(bytesText.decode("utf-8"))
+                # print(doc.summary())
+                results.extend(extractRequirements(doc.summary()))  # keep every article
+        elif sourcetype == "text":
+            bytesText = simpleGet(url)
+            results = extractRequirements(bytesText.decode("utf-8"))
+    except Exception as e:
+        logging.exception(e)
+    # One requirement per line; "" when nothing was extracted or a step failed.
+    result = "\n".join(results)  # assumes the entries are strings - TODO confirm
+    return result
-def summarizeLinkToAudio(argparser: Argparser) -> None:
+def summarizeLinkToAudio(url, summary) -> str:
     """Summarizes the text inside a given url into audio."""
+    result = str()
     try:
-        article = Article(argparser.args.source)
+        article = Article(url)
         article.download()
         article.parse()
-        if argparser.args.summary == "newspaper":
+        if summary == "newspaper":
             article.nlp()
-            textToAudio(article.summary)
-        elif argparser.args.summary == "none":
-            textToAudio(article.text)
-        elif argparser.args.summary == "bart":
-            textToAudio(summarizeText(article.text))
+            result = article.summary
+        elif summary == "none":
+            result = article.text
+        elif summary == "bart":
+            result = summarizeText(article.text)  # bart must summarize, not echo the text
         else:
-            print("invalid option for summry type.")
+            print("invalid option for summary type.")
+            result = str()  # "" (not None) is what the callers compare against
     except Exception as e:
+        result = str()
         logging.exception(e)
+    # Summary text on success; "" for an unknown summary type or any error.
+    return result
-def summarizeLinksToAudio(argparser: Argparser) -> None:
+def summarizeLinksToAudio(url, summary) -> str:
     """Summarize a list of urls into audio files."""
-    config = Config()
-    configNews(config)
-    urls = getURLS(argparser.args.source)
-    for url in urls:
-        summarizeLinkToAudio(url)
+    results = list()
+    result = str()
+    try:
+        config = Config()
+        configNews(config)
+        urls = getURLS(url)  # single argument, like the call this line replaces
+        for link in urls:
+            results.append(summarizeLinkToAudio(link, summary))
+    except Exception as e:
+        logging.exception(e)
+    # Concatenated summaries; links that produced nothing (None/"") are skipped.
+    result = "".join(text for text in results if text)
+    return result
-def searchWikipedia(argparser: Argparser) -> str:
+def searchWikipedia(search_term: str) -> str:
     """Search wikipedia for a string and return the url.
     reference: https://www.mediawiki.org/wiki/API:Opensearch
     """
-    searchParmas = {
-        "action": "opensearch",
-        "namespace": "0",
-        "search": argparser.args.search,
-        "limit": "10",
-        "format": "json",
+    result = str()
+    try:
+        searchParams = {
+            "action": "opensearch",
+            "namespace": "0",
+            "search": search_term,
+            "limit": "10",
+            "format": "json",
+        }
+        res = getWithParams(os.environ["WIKI_SEARCH_URL"], searchParams)
+        # FIXME-handle wiki redirects/disambiguations
+        print(res)
+        result = res[3][0]  # first url of the reply; no hit -> IndexError -> ""
+    except Exception as e:
+        logging.exception(e)
+    # "" means the lookup failed or returned no match.
+    return result
+
+
+def getAudioFromFile(audio_path: str) -> str:
+ """Returns the contents of a file in binary format"""
+ with open(audio_path, "rb") as audio:
+ return audio.read()
+
+
+app = FastAPI()
+
+
+@app.get("/tika")
+async def pdf_to_audio_ep(url: str):
+ """turns a pdf into an audiofile"""
+ audio_path = pdfToVoice()
+ return {
+ "Content-Type": "application/json",
+ "isOK": True if audio_path != "" else False,
+ "audio": getAudioFromFile(audio_path) if audio_path != "" else "",
}
- res = getWithParams(WIKIPEDIA_SEARCH_URL, searchParmas)
- print(res)
- argparser.args.source = res[3][0]
- summarizeLinkToAudio(argparser)
-
-
-def main() -> None:
- argparser = Argparser()
- if argparser.args.singlelink:
- summarizeLinkToAudio(argparser)
- elif argparser.args.multilink:
- summarizeLinksToAudio(argparser)
- elif argparser.args.pdftomp3:
- pdfToVoice(argparser)
- elif argparser.args.search:
- searchWikipedia(argparser)
- else:
- pass
-if __name__ == "__main__":
- main()
+@app.get("/reqs")
+async def extract_reqs_ep(url: str, sourcetype: str = "html"):
+    """extracts the requirements from a given url"""
+    result = getRequirements(url, sourcetype)  # both arguments are required
+    return {
+        "Content-Type": "application/json",
+        "isOK": result != "",
+        "reqs": result,
+    }
+
+
+@app.get("/wiki")
+async def wiki_search_ep(term: str, audio: bool = False):
+    """search and summarizes from wikipedia"""
+    text = searchWikipedia(term)
+    if audio:
+        # Nothing to synthesize without text; "" marks "no audio" below.
+        audio_path = textToAudio(text) if text else ""
+        return {
+            "Content-Type": "application/json",
+            "isOK": audio_path != "" and text != "",
+            "audio": getAudioFromFile(audio_path) if audio_path != "" else "",
+            "text": text,
+        }
+    else:
+        return {
+            "Content-Type": "application/json",
+            "isOK": text != "",
+            "audio": "",
+            "text": text,
+        }
+
+
+@app.get("/summ")
+async def summarize_ep(url: str, summary: str = "none", audio: bool = False):
+    """summarize and turn the summary into audio"""
+    text = summarizeLinkToAudio(url, summary) or ""  # None and "" both mean failure
+    if audio:
+        # Nothing to synthesize without text; "" marks "no audio" below.
+        audio_path = textToAudio(text) if text else ""
+        return {
+            "Content-Type": "application/json",
+            "isOK": audio_path != "" and text != "",
+            "audio": getAudioFromFile(audio_path) if audio_path != "" else "",
+            "text": text,
+        }
+    else:
+        return {
+            "Content-Type": "application/json",
+            "isOK": text != "",
+            "audio": "",
+            "text": text,
+        }
+
+
+@app.get("/mila")
+async def mila_ep(url: str, summary: str = "newspaper", audio: bool = False):
+    """extract all the urls and then summarize and turn into audio"""
+    text = summarizeLinksToAudio(url, summary)
+    if audio:
+        # Nothing to synthesize without text; "" marks "no audio" below.
+        audio_path = textToAudio(text) if text else ""
+        return {
+            "Content-Type": "application/json",
+            "isOK": audio_path != "" and text != "",
+            "audio": getAudioFromFile(audio_path) if audio_path != "" else "",
+            "text": text,
+        }
+    else:
+        return {
+            "Content-Type": "application/json",
+            "isOK": text != "",
+            "audio": "",
+            "text": text,
+        }
diff --git a/docker-compose-test.yaml b/docker-compose-test.yaml
new file mode 100644
index 0000000..fea638b
--- /dev/null
+++ b/docker-compose-test.yaml
@@ -0,0 +1,28 @@
+version: "3.7"
+services:
+  devourer:
+    image: devourer
+    networks:
+      - mainnet
+      - tikanet
+    depends_on:
+      - tika
+    ports:
+      - "9009:80"
+    environment:  # list-form values are taken literally, so they carry no quotes
+      - TIKA_SERVER_ENDPOINT=http://tika:9998
+      - AUDIO_DUMP_DIR=/tmp
+      - WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php
+      - SERVER_DEPLOYMENT_TYPE=test
+    cap_drop:
+      - ALL
+    entrypoint: ["/docker-entrypoint.sh"]
+  tika:
+    image: apache/tika:2.0.0
+    networks:
+      - tikanet
+    cap_drop:
+      - ALL
+networks:
+  mainnet:
+  tikanet:
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..5a2bccf
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,28 @@
+version: "3.7"
+services:
+  devourer:
+    image: devourer
+    networks:
+      - mainnet
+      - tikanet
+    depends_on:
+      - tika
+    ports:
+      - "9009:80"
+    environment:  # list-form values are taken literally, so they carry no quotes
+      - TIKA_SERVER_ENDPOINT=http://tika:9998
+      - AUDIO_DUMP_DIR=/tmp
+      - WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php
+      - SERVER_DEPLOYMENT_TYPE=deployment
+    cap_drop:
+      - ALL
+    entrypoint: ["/docker-entrypoint.sh"]
+  tika:
+    image: apache/tika:2.0.0
+    networks:
+      - tikanet
+    cap_drop:
+      - ALL
+networks:
+  mainnet:
+  tikanet:
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
new file mode 100755
index 0000000..58d63ce
--- /dev/null
+++ b/docker-entrypoint.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env sh
+# The image ships devourer.py (not main.py), so the ASGI target is devourer:app.
+if [ "$SERVER_DEPLOYMENT_TYPE" = "deployment" ]; then
+  uvicorn devourer:app --host 0.0.0.0 --port 80 --ssl-certfile /certs/server.cert --ssl-keyfile /certs/server.key
+elif [ "$SERVER_DEPLOYMENT_TYPE" = "test" ]; then
+  uvicorn devourer:app --host 0.0.0.0 --port 80 --ssl-certfile /certs/server.cert --ssl-keyfile /certs/server.key
+fi
diff --git a/pyproject.toml b/pyproject.toml
index ca26b0f..1c677c6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,11 +12,11 @@ beautifulsoup4 = "^4.9.3"
readability-lxml = "^0.8.1"
gtts = "^2.2.3"
tika = "^1.24"
-docker = "^5.0.2"
nltk = "^3.6.3"
tensorflow = "^2.6.0"
torch = "^1.9.1"
transformers = "^4.11.2"
+fastapi = "^0.70.0"
[tool.poetry.dev-dependencies]