From e43d90efe20ed06090ad1034b789e7a1b6ee00fe Mon Sep 17 00:00:00 2001 From: terminaldweller Date: Sat, 2 Apr 2022 02:35:40 +0430 Subject: wip --- devourer.py | 110 ++++++++++++++++++++++++++++++++++----------------- docker-entrypoint.sh | 18 ++++++++- pyproject.toml | 4 +- 3 files changed, 92 insertions(+), 40 deletions(-) diff --git a/devourer.py b/devourer.py index 8196050..b0ce45a 100644 --- a/devourer.py +++ b/devourer.py @@ -1,31 +1,30 @@ # _*_ coding=utf-8 _*_ +import bs4 +import contextlib +import datetime +import fastapi +import gtts import logging -import tika +import newspaper import nltk +import os import random +import re +import readability +import requests import string -import os -from newspaper import Article, build, Config -from bs4 import BeautifulSoup -from contextlib import closing -from requests import get, Response -from requests.exceptions import RequestException -from re import findall -from readability import Document -from gtts import gTTS -from datetime import datetime as time -from fastapi import FastAPI -from fastapi import Response as APIResponse +import tika +import transformers # FIXME-maybe actually really do some logging -def logError(err: RequestException) -> None: +def logError(err: requests.exceptions.RequestException) -> None: """Logs the errors.""" logging.exception(err) -def isAGoodResponse(resp: Response) -> bool: +def isAGoodResponse(resp: requests.Response) -> bool: """Checks whether the get we sent got a 200 response.""" content_type = resp.headers["Content-Type"].lower() return resp.status_code == 200 and content_type is not None @@ -34,12 +33,12 @@ def isAGoodResponse(resp: Response) -> bool: def simpleGet(url: str) -> bytes: """Issues a simple get request.""" try: - with closing(get(url, stream=True)) as resp: + with contextlib.closing(requests.get(url, stream=True)) as resp: if isAGoodResponse(resp): return resp.content else: return None - except RequestException as e: + except requests.exceptions.RequestException as e: logError("Error during requests to {0} : {1}".format(url, str(e))) return None @@ -47,12 +46,14 @@ def simpleGet(url: str) -> bytes: def getWithParams(url: str, params: dict) -> dict: """Issues a get request with params.""" try: - with closing(get(url, params=params, stream=True)) as resp: + with contextlib.closing( + requests.get(url, params=params, stream=True) + ) as resp: if isAGoodResponse(resp): return resp.json() else: return None - except RequestException as e: + except requests.exceptions.RequestException as e: logError("Error during requests to {0} : {1}".format(url, str(e))) return None @@ -66,7 +67,7 @@ def getURLS(source: str) -> dict: """Extracts the urls from a website.""" result = dict() raw_ml = simpleGet(source) - ml = BeautifulSoup(raw_ml, "lxml") + ml = bs4.BeautifulSoup(raw_ml, "lxml") rand_tmp = "/tmp/" + getRandStr(20) ml_str = repr(ml) @@ -76,7 +77,7 @@ def getURLS(source: str) -> dict: tmp = open(rand_tmp, "r") url_list = [] for line in tmp: - url = findall( + url = re.findall( "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|" r"(?:%[0-9a-fA-F][0-9a-fA-F]))+", line, @@ -88,7 +89,7 @@ def getURLS(source: str) -> dict: return result -def configNews(config: Config) -> None: +def configNews(config: newspaper.Config) -> None: """Configures newspaper.""" config.fetch_images = False config.keep_article_html = True @@ -102,7 +103,7 @@ def pdfToVoice() -> str: outfile = str() try: rawText = tika.parser.from_file() - tts = gTTS(rawText["content"]) + tts = gtts.gTTS(rawText["content"]) outfile = getRandStr(20) + ".mp3" tts.save(outfile) except Exception as e: @@ -138,7 +139,6 @@ def extractRequirements(textBody: str) -> list: def summarizeText(text: str) -> str: """Summarize the given text using bart.""" - import transformers model = transformers.BartForConditionalGeneration.from_pretrained( "facebook/bart-large-cnn" @@ -162,8 +162,8 @@ def textToAudio(text: str) -> str: """Transform the given text into audio.""" path = str() try: - time_str = time.today().strftime("%b-%d-%Y-%M-%S-%f") - tts = gTTS(text) + time_str = datetime.datetime.today().strftime("%b-%d-%Y-%M-%S-%f") + tts = gtts.gTTS(text) tts.save(os.environ["AUDIO_DUMP_DIR"] + "/" + time_str + ".mp3") path = os.environ["AUDIO_DUMP_DIR"] + "/" + time_str + ".mp3" except Exception as e: @@ -178,13 +178,13 @@ def getRequirements(url: str, sourcetype: str) -> list: results = list() try: if sourcetype == "html": - parser = build(url) + parser = newspaper.build(url) for article in parser.articles: - a = Article(article.url) + a = newspaper.Article(article.url) a.download() a.parse() a.nlp() - doc = Document(a.html) + doc = readability.Document(a.html) print(doc) # print(doc.summary()) # results = extractRequirements(doc.summary()) @@ -206,7 +206,7 @@ def summarizeLinkToAudio(url, summary) -> str: """Summarizes the text inside a given url into audio.""" result = str() try: - article = Article(url) + article = newspaper.Article(url) article.download() article.parse() if summary == "newspaper": @@ -230,7 +230,7 @@ def summarizeLinksToAudio(url, summary) -> None: results = list() result = str() try: - config = Config() + config = newspaper.Config() configNews(config) urls = getURLS(url, summary) for url in urls: @@ -272,7 +272,38 @@ def getAudioFromFile(audio_path: str) -> str: return audio.read() -app = FastAPI() +def getSentiments() -> list: + """Get sentiments""" + results = list() + SOURCE = "https://github.com/coinpride/CryptoList" + urls = simpleGet(SOURCE) + classifier = transformers.pipeline("sentiment-analysis") + for url in urls: + req_result = simpleGet(url) + results.append(classifier(req_result)) + return results + + +app = fastapi.FastAPI() + + +# https://cheatsheetseries.owasp.org/cheatsheets/REST_Security_Cheat_Sheet.html +@app.middleware("http") +async def addSecureHeaders( + request: fastapi.Request, call_next +) -> fastapi.Response: + """adds security headers proposed by OWASP""" + response = await call_next(request) + response.headers["Cache-Control"] = "no-store" + response.headers["Content-Security-Policy"] = "default-src-https" + response.headers["Strict-Transport-Security"] = "max-age=63072000" + response.headers["X-Content-Type-Options"] = "nosniff" + response.headers["X-Frame-Options"] = "DENY" + response.headers["Access-Control-Allow-Methods"] = "GET,OPTIONS" + return response + + +app.add_middleware(addSecureHeaders) nltk.download("punkt") @@ -280,7 +311,7 @@ nltk.download("punkt") def pdf_to_audio_ep(url: str): """turns a pdf into an audiofile""" audio_path = pdfToVoice() - return APIResponse( + return fastapi.Response( getAudioFromFile(audio_path) if audio_path != "" else "", media_type="audio/mpeg", ) @@ -303,7 +334,7 @@ def wiki_search_ep(term: str, summary: str = "none", audio: bool = False): text = searchWikipedia(term, summary) if audio: audio_path = textToAudio(text) - return APIResponse( + return fastapi.Response( getAudioFromFile(audio_path) if audio_path != "" else "", media_type="audio/mpeg", ) @@ -323,7 +354,7 @@ def summarize_ep(url: str, summary: str = "none", audio: bool = False): if audio: audio_path = textToAudio(text) print(audio_path) - return APIResponse( + return fastapi.Response( getAudioFromFile(audio_path) if audio_path != "" else "", media_type="audio/mpeg", ) @@ -343,7 +374,7 @@ def mila_ep(url: str, summary: str = "newspaper", audio: bool = False): if audio: audio_path = textToAudio(text) print(audio_path) - return APIResponse( + return fastapi.Response( getAudioFromFile(audio_path) if audio_path != "" else "", media_type="audio/mpeg", ) @@ -356,6 +387,13 @@ def mila_ep(url: str, summary: str = "newspaper", audio: bool = False): } +@app.get("/mila/sentiments") +def sentiments_endpoint(url: str): + """the sentiments endpoint""" + sentiments = getSentiments() + return {"Content-Type": "application/json", "Sentiments": sentiments} + + @app.get("/mila/health") def health_ep(): return {"isOK": True} diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 92e2537..43159f2 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -1,7 +1,21 @@ #!/usr/bin/env sh if [ "$SERVER_DEPLOYMENT_TYPE" = "deployment" ]; then - uvicorn devourer:app --host 0.0.0.0 --port 80 --ssl-certfile /certs/server.cert --ssl-keyfile /certs/server.key + uvicorn devourer:app \ + --host 0.0.0.0 \ + --port 80 \ + --ssl-certfile /certs/server.cert \ + --ssl-keyfile /certs/server.key \ + --no-proxy-headers \ + --no-server-headers \ + --no-date-headers elif [ "$SERVER_DEPLOYMENT_TYPE" = "test" ]; then - uvicorn devourer:app --host 0.0.0.0 --port 80 --ssl-certfile /certs/server.cert --ssl-keyfile /certs/server.key + uvicorn devourer:app \ + --host 0.0.0.0 \ + --port 80 \ + --ssl-certfile /certs/server.cert \ + --ssl-keyfile /certs/server.key \ + --no-proxy-headers \ + --no-server-headers \ + --no-date-headers fi diff --git a/pyproject.toml b/pyproject.toml index 95b9f5d..d0fa291 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,12 @@ [tool.poetry] name = "devourer" -version = "0.1.0" +version = "1.0.0" description = "" authors = ["terminaldweller "] license = "GPL3.0" [tool.poetry.dependencies] -python = "^3.8" +python = "3.8" newspaper3k = "^0.2.8" beautifulsoup4 = "^4.9.3" readability-lxml = "^0.8.1" -- cgit v1.2.3