diff options
author | terminaldweller <thabogre@gmail.com> | 2021-11-11 17:58:22 +0000 |
---|---|---|
committer | terminaldweller <thabogre@gmail.com> | 2021-11-11 17:58:22 +0000 |
commit | 6777413a7f9c8d9ab784e7a39dfb5f60e405cb7b (patch) | |
tree | 691ae46a355c35230cbba30c6f80e766a0d3158c | |
parent | update (diff) | |
download | devourer-6777413a7f9c8d9ab784e7a39dfb5f60e405cb7b.tar.gz devourer-6777413a7f9c8d9ab784e7a39dfb5f60e405cb7b.zip |
WIP
-rw-r--r-- | Dockerfile | 13 | ||||
-rw-r--r--[-rwxr-xr-x] | devourer.py | 330 | ||||
-rw-r--r-- | docker-compose-test.yaml | 28 | ||||
-rw-r--r-- | docker-compose.yaml | 28 | ||||
-rwxr-xr-x | docker-entrypoint.sh | 7 | ||||
-rw-r--r-- | pyproject.toml | 2 |
6 files changed, 261 insertions, 147 deletions
@@ -19,9 +19,16 @@ WORKDIR $PYSETUP_PATH COPY ./pyproject.toml ./ RUN poetry install --no-dev +FROM node:lts-alpine3.13 AS certbuilder +RUN apk add openssl +WORKDIR /certs +RUN openssl req -nodes -new -x509 -subj="/C=US/ST=Denial/L=springfield/O=Dis/CN=localhost" -keyout server.key -out server.cert + FROM python-base as production +RUN pip3 install uvicorn +COPY --from=certbuilder /certs/ /certs ENV FASTAPI_ENV=production COPY --from=builder-base $VENV_PATH $VENV_PATH -COPY ./main.py $PYSETUP_PATH/main.py -ENTRYPOINT ["$PYSETUP_PATH/main.py"] -# CMD ["--source", "https://github.com/coinpride/CryptoList"] +COPY ./docker-entrypoint.sh /docker-entrypoint.sh +COPY ./devourer.py $PYSETUP_PATH/devourer.py +WORKDIR $PYSETUP_PATH diff --git a/devourer.py b/devourer.py index e869c4c..cc25206 100755..100644 --- a/devourer.py +++ b/devourer.py @@ -1,14 +1,11 @@ -#!/usr/bin/env python3 # _*_ coding=utf-8 _*_ -import argparse import logging import tika -import docker -import os import nltk import random import string +import os from newspaper import Article, build, Config from bs4 import BeautifulSoup from contextlib import closing @@ -18,69 +15,7 @@ from re import findall from readability import Document from gtts import gTTS from datetime import datetime as time - - -WIKIPEDIA_SEARCH_URL = "https://en.wikipedia.org/w/api.php" - - -class Argparser(object): - def __init__(self): - parser = argparse.ArgumentParser() - parser.add_argument( - "--source", - type=str, - help="the url where the \ - urls to be extracted reside", - default="", - ) - parser.add_argument( - "--out", - type=str, - help="the output file name if it applies", - default="", - ) - parser.add_argument( - "--singlelink", - action="store_true", - help="whether the app should work in single-link \ - meaning only one page's contents will be used \ - mode", - default=False, - ) - parser.add_argument( - "--multilink", - action="store_true", - help="whether the app should work in multi-link \ - mode meaning the srouce contians a list of links \ - rather than being the actual source itself", - default=False, - ) - parser.add_argument( - "--sourcetype", - type=str, - help="determines the type of the \ - source:html,text,...", - default="html", - ) - parser.add_argument( - "--pdftomp3", - action="store_true", - default=False, - help="convert pdf to mp3. \ - source should be the path to a pdf file and\ - out should be the path to the mp3 output file", - ) - parser.add_argument( - "--summary", - type=str, - default="newspaper", - help="which summary type to use. currently we \ - have newspaper, bart and none.", - ) - parser.add_argument( - "--search", type=str, default="", help="the string to search for" - ) - self.args = parser.parse_args() +from fastapi import FastAPI # FIXME-maybe actually really do some logging @@ -109,7 +44,7 @@ def simpleGet(url: str) -> bytes: def getWithParams(url: str, params: dict) -> dict: - """Issues a get requesti with params.""" + """Issues a get request with params.""" try: with closing(get(url, params=params, stream=True)) as resp: if isAGoodResponse(resp): @@ -160,24 +95,19 @@ def configNews(config: Config) -> None: config.browser_user_agent = "Chrome/91.0.4464.5" -def pdfToVoice(argparser: Argparser) -> None: +# FIXME-have to decide whether to use files or urls +def pdfToVoice() -> str: """Main function for converting a pdf to an mp3.""" - TIKA_SERVER_ENDPOINT = "127.0.0.1:9977" - os.environ["TIKA_SERVER_ENDPOINT"] = TIKA_SERVER_ENDPOINT - dockerClient = docker.from_env() - container = dockerClient.containers.run( - "apache/tika:2.0.0", detach=True, ports={TIKA_SERVER_ENDPOINT: "9998"} - ) - while True: - resp = get("http://127.0.0.1:9977") - if resp.status_code == 200: - break - time.sleep(0.5) - rawText = tika.parser.from_file() - tts = gTTS(rawText["content"]) - tts.save(argparser.args.out) - container.stop() - dockerClient.close() + outfile = str() + try: + rawText = tika.parser.from_file() + tts = gTTS(rawText["content"]) + outfile = getRandStr(20) + ".mp3" + tts.save(outfile) + except Exception as e: + logging.exception(e) + finally: + return outfile def extractRequirements(textBody: str) -> list: @@ -228,90 +158,204 @@ def summarizeText(text: str) -> str: ] -def textToAudio(text: str) -> None: +def textToAudio(text: str) -> str: """Transform the given text into audio.""" - tts = gTTS(text) - tts.save(time.today().strftime("%b-%d-%Y-%M-%S-%f") + ".mp3") + try: + path = str() + path = ( + os.environ["AUDIO_DUMP_DIR"] + + time.today().strftime("%b-%d-%Y-%M-%S-%f") + + ".mp3" + ) + tts = gTTS(text) + tts.save(path) + except Exception as e: + logging.exception(e) + finally: + return path -def singleLinkMode(argparser: Argparser) -> dict: +def getRequirements(url: str, sourcetype: str) -> list: """Runs the single-link main function.""" - if argparser.args.sourcetype == "html": - parser = build(argparser.args.source) - for article in parser.articles: - a = Article(article.url) - try: + result = str() + results = list() + try: + if sourcetype == "html": + parser = build(url) + for article in parser.articles: + a = Article(article.url) a.download() a.parse() doc = Document(a.html) - print(doc.summary()) - extractRequirements(doc.summary()) - except Exception as e: - logging.exception(e) - elif argparser.args.sourcetype == "text": - bytesText = simpleGet(argparser.args.source) - extractRequirements(bytesText.decode("utf-8")) + # print(doc.summary()) + results = extractRequirements(doc.summary()) + elif sourcetype == "text": + bytesText = simpleGet(url) + results = extractRequirements(bytesText.decode("utf-8")) + except Exception as e: + logging.exception(e) + finally: + result = "".join(results + "\n") + return result -def summarizeLinkToAudio(argparser: Argparser) -> None: +def summarizeLinkToAudio(url, summary) -> str: """Summarizes the text inside a given url into audio.""" + result = str() try: - article = Article(argparser.args.source) + article = Article(url) article.download() article.parse() - if argparser.args.summary == "newspaper": + if summary == "newspaper": article.nlp() - textToAudio(article.summary) - elif argparser.args.summary == "none": - textToAudio(article.text) - elif argparser.args.summary == "bart": - textToAudio(summarizeText(article.text)) + result = article.summary + elif summary == "none": + result = article.text + elif summary == "bart": + result = article.text else: - print("invalid option for summry type.") + print("invalid option for summary type.") + result = None except Exception as e: + result = None logging.exception(e) + finally: + return result -def summarizeLinksToAudio(argparser: Argparser) -> None: +def summarizeLinksToAudio(url, summary) -> None: """Summarize a list of urls into audio files.""" - config = Config() - configNews(config) - urls = getURLS(argparser.args.source) - for url in urls: - summarizeLinkToAudio(url) + results = list() + result = str() + try: + config = Config() + configNews(config) + urls = getURLS(url, summary) + for url in urls: + results.append(summarizeLinkToAudio(url)) + except Exception as e: + logging.exception(e) + finally: + result = "".join(results) + return result -def searchWikipedia(argparser: Argparser) -> str: +def searchWikipedia(search_term: str) -> str: """Search wikipedia for a string and return the url. reference: https://www.mediawiki.org/wiki/API:Opensearch """ - searchParmas = { - "action": "opensearch", - "namespace": "0", - "search": argparser.args.search, - "limit": "10", - "format": "json", + result = str() + try: + searchParmas = { + "action": "opensearch", + "namespace": "0", + "search": search_term, + "limit": "10", + "format": "json", + } + res = getWithParams(os.environ["WIKI_SEARCH_URL"], searchParmas) + # FIXME-handle wiki redirects/disambiguations + # argparser.args.source = res[3][0] + print(res) + except Exception as e: + logging.exception(e) + finally: + return result + + +def getAudioFromFile(audio_path: str) -> str: + """Returns the contents of a file in binary format""" + with open(audio_path, "rb") as audio: + return audio.read() + + +app = FastAPI() + + +@app.get("/tika") +async def pdf_to_audio_ep(url: str): + """turns a pdf into an audiofile""" + audio_path = pdfToVoice() + return { + "Content-Type": "application/json", + "isOK": True if audio_path != "" else False, + "audio": getAudioFromFile(audio_path) if audio_path != "" else "", } - res = getWithParams(WIKIPEDIA_SEARCH_URL, searchParmas) - print(res) - argparser.args.source = res[3][0] - summarizeLinkToAudio(argparser) - - -def main() -> None: - argparser = Argparser() - if argparser.args.singlelink: - summarizeLinkToAudio(argparser) - elif argparser.args.multilink: - summarizeLinksToAudio(argparser) - elif argparser.args.pdftomp3: - pdfToVoice(argparser) - elif argparser.args.search: - searchWikipedia(argparser) - else: - pass -if __name__ == "__main__": - main() +@app.get("/reqs") +async def extract_reqs_ep(url: str, sourcetype: str = "html"): + """extracts the requirements from a given url""" + result = getRequirements() + return { + "Content-Type": "application/json", + "isOK": True if result != "" else False, + "reqs": result, + } + + +@app.get("/wiki") +async def wiki_search_ep(term: str, audio: bool = False): + """search and summarizes from wikipedia""" + text = searchWikipedia(term) + if audio: + audio_path = textToAudio(text) + return { + "Content-Type": "application/json", + "isOK": (True if audio_path != "" else False) + and (True if text != "" else False), + "audio": getAudioFromFile(audio_path) if audio_path != "" else "", + "text": text, + } + else: + return { + "Content-Type": "application/json", + "isOK": True if text != "" else False, + "audio": "", + "text": text, + } + + +@app.get("/summ") +async def summarize_ep(url: str, summary: str = "none", audio: bool = False): + """summarize and turn the summary into audio""" + text = summarizeLinkToAudio(url, summary) + if audio: + audio_path = textToAudio(text) + return { + "Content-Type": "application/json", + "isOK": (True if audio_path != "" else False) + and (True if text != "" else False), + "audio": getAudioFromFile(audio_path) if audio_path != "" else "", + "text": text, + } + else: + return { + "Content-Type": "application/json", + "isOK": True if text != "" else False, + "audio": "", + "text": text, + } + + +@app.get("/mila") +async def mila_ep(url: str, summary: str = "newspaper", audio: bool = False): + """extract all the urls and then summarize and turn into audio""" + text = summarizeLinksToAudio(url, summary) + if audio: + audio_path = textToAudio(text) + return { + "Content-Type": "application/json", + "isOK": (True if audio_path != "" else False) + and (True if text != "" else False), + "audio": getAudioFromFile(audio_path) if audio_path != "" else "", + "text": text, + } + else: + return { + "Content-Type": "application/json", + "isOK": True if text != "" else False, + "audio": "", + "text": text, + } diff --git a/docker-compose-test.yaml b/docker-compose-test.yaml new file mode 100644 index 0000000..fea638b --- /dev/null +++ b/docker-compose-test.yaml @@ -0,0 +1,28 @@ +version: "3.7" +services: + devourer: + image: devourer + networks: + - mainnet + - tikanet + depends_on: + - tika + ports: + - "9009:80" + environment: + - TIKA_SERVER_ENDPOINT="tika:9998" + - AUDIO_DUMP_DIR="/tmp" + - WIKI_SEARCH_URL="https://en.wikipedia.org/w/api.php" + - SERVER_DEPLOYMENT_TYPE=test + cap_drop: + - ALL + entrypoint: ["/docker-entrypoint.sh"] + tika: + image: apache/tike:2.0.0 + networks: + - tikanet + cap_drop: + - ALL +networks: + mainnet: + tikanet: diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..5a2bccf --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,28 @@ +version: "3.7" +services: + devourer: + image: devourer + networks: + - mainnet + - tikanet + depends_on: + - tika + ports: + - "9009:80" + environment: + - TIKA_SERVER_ENDPOINT="tika:9998" + - AUDIO_DUMP_DIR="/tmp" + - WIKI_SEARCH_URL="https://en.wikipedia.org/w/api.php" + - SERVER_DEPLOYMENT_TYPE=deployment + cap_drop: + - ALL + entrypoint: ["/docker-entrypoint.sh"] + tika: + image: apache/tike:2.0.0 + networks: + - tikanet + cap_drop: + - ALL +networks: + mainnet: + tikanet: diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh new file mode 100755 index 0000000..58d63ce --- /dev/null +++ b/docker-entrypoint.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env sh + +if [ "$SERVER_DEPLOYMENT_TYPE" = "deployment" ]; then + uvicorn main:app --host 0.0.0.0 --port 80 --ssl-certfile /certs/server.cert --ssl-keyfile /certs/server.key +elif [ "$SERVER_DEPLOYMENT_TYPE" = "test" ]; then + uvicorn main:app --host 0.0.0.0 --port 80 --ssl-certfile /certs/server.cert --ssl-keyfile /certs/server.key +fi diff --git a/pyproject.toml b/pyproject.toml index ca26b0f..1c677c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,11 +12,11 @@ beautifulsoup4 = "^4.9.3" readability-lxml = "^0.8.1" gtts = "^2.2.3" tika = "^1.24" -docker = "^5.0.2" nltk = "^3.6.3" tensorflow = "^2.6.0" torch = "^1.9.1" transformers = "^4.11.2" +fastapi = "^0.70.0" [tool.poetry.dev-dependencies] |