author    terminaldweller <thabogre@gmail.com>  2022-04-01 22:05:40 +0000
committer terminaldweller <thabogre@gmail.com>  2022-04-01 22:05:40 +0000
commit    e43d90efe20ed06090ad1034b789e7a1b6ee00fe (patch)
tree      eceacdd49995fcb08cfde73c836e3c0adc7ebb81
parent    added README. /wiki is working now. (diff)
wip
-rw-r--r--  devourer.py           | 110
-rwxr-xr-x  docker-entrypoint.sh  |  18
-rw-r--r--  pyproject.toml        |   4
3 files changed, 92 insertions(+), 40 deletions(-)
diff --git a/devourer.py b/devourer.py
index 8196050..b0ce45a 100644
--- a/devourer.py
+++ b/devourer.py
@@ -1,31 +1,30 @@
# _*_ coding=utf-8 _*_
+import bs4
+import contextlib
+import datetime
+import fastapi
+import gtts
import logging
-import tika
+import newspaper
import nltk
+import os
import random
+import re
+import readability
+import requests
import string
-import os
-from newspaper import Article, build, Config
-from bs4 import BeautifulSoup
-from contextlib import closing
-from requests import get, Response
-from requests.exceptions import RequestException
-from re import findall
-from readability import Document
-from gtts import gTTS
-from datetime import datetime as time
-from fastapi import FastAPI
-from fastapi import Response as APIResponse
+import tika
+import transformers
# FIXME-maybe actually really do some logging
-def logError(err: RequestException) -> None:
+def logError(err: requests.exceptions.RequestException) -> None:
"""Logs the errors."""
logging.exception(err)
-def isAGoodResponse(resp: Response) -> bool:
+def isAGoodResponse(resp: requests.Response) -> bool:
"""Checks whether the get we sent got a 200 response."""
content_type = resp.headers["Content-Type"].lower()
return resp.status_code == 200 and content_type is not None
@@ -34,12 +33,12 @@ def isAGoodResponse(resp: Response) -> bool:
def simpleGet(url: str) -> bytes:
"""Issues a simple get request."""
try:
- with closing(get(url, stream=True)) as resp:
+ with contextlib.closing(requests.get(url, stream=True)) as resp:
if isAGoodResponse(resp):
return resp.content
else:
return None
- except RequestException as e:
+ except requests.exceptions.RequestException as e:
logError("Error during requests to {0} : {1}".format(url, str(e)))
return None
@@ -47,12 +46,14 @@ def simpleGet(url: str) -> bytes:
def getWithParams(url: str, params: dict) -> dict:
"""Issues a get request with params."""
try:
- with closing(get(url, params=params, stream=True)) as resp:
+ with contextlib.closing(
+ requests.get(url, params=params, stream=True)
+ ) as resp:
if isAGoodResponse(resp):
return resp.json()
else:
return None
- except RequestException as e:
+ except requests.exceptions.RequestException as e:
logError("Error during requests to {0} : {1}".format(url, str(e)))
return None
@@ -66,7 +67,7 @@ def getURLS(source: str) -> dict:
"""Extracts the urls from a website."""
result = dict()
raw_ml = simpleGet(source)
- ml = BeautifulSoup(raw_ml, "lxml")
+ ml = bs4.BeautifulSoup(raw_ml, "lxml")
rand_tmp = "/tmp/" + getRandStr(20)
ml_str = repr(ml)
@@ -76,7 +77,7 @@ def getURLS(source: str) -> dict:
tmp = open(rand_tmp, "r")
url_list = []
for line in tmp:
- url = findall(
+ url = re.findall(
"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|"
r"(?:%[0-9a-fA-F][0-9a-fA-F]))+",
line,
@@ -88,7 +89,7 @@ def getURLS(source: str) -> dict:
return result
-def configNews(config: Config) -> None:
+def configNews(config: newspaper.Config) -> None:
"""Configures newspaper."""
config.fetch_images = False
config.keep_article_html = True
@@ -102,7 +103,7 @@ def pdfToVoice() -> str:
outfile = str()
try:
rawText = tika.parser.from_file()
- tts = gTTS(rawText["content"])
+ tts = gtts.gTTS(rawText["content"])
outfile = getRandStr(20) + ".mp3"
tts.save(outfile)
except Exception as e:
@@ -138,7 +139,6 @@ def extractRequirements(textBody: str) -> list:
def summarizeText(text: str) -> str:
"""Summarize the given text using bart."""
- import transformers
model = transformers.BartForConditionalGeneration.from_pretrained(
"facebook/bart-large-cnn"
@@ -162,8 +162,8 @@ def textToAudio(text: str) -> str:
"""Transform the given text into audio."""
path = str()
try:
- time_str = time.today().strftime("%b-%d-%Y-%M-%S-%f")
- tts = gTTS(text)
+ time_str = datetime.datetime.today().strftime("%b-%d-%Y-%M-%S-%f")
+ tts = gtts.gTTS(text)
tts.save(os.environ["AUDIO_DUMP_DIR"] + "/" + time_str + ".mp3")
path = os.environ["AUDIO_DUMP_DIR"] + "/" + time_str + ".mp3"
except Exception as e:
@@ -178,13 +178,13 @@ def getRequirements(url: str, sourcetype: str) -> list:
results = list()
try:
if sourcetype == "html":
- parser = build(url)
+ parser = newspaper.build(url)
for article in parser.articles:
- a = Article(article.url)
+ a = newspaper.Article(article.url)
a.download()
a.parse()
a.nlp()
- doc = Document(a.html)
+ doc = readability.Document(a.html)
print(doc)
# print(doc.summary())
# results = extractRequirements(doc.summary())
@@ -206,7 +206,7 @@ def summarizeLinkToAudio(url, summary) -> str:
"""Summarizes the text inside a given url into audio."""
result = str()
try:
- article = Article(url)
+ article = newspaper.Article(url)
article.download()
article.parse()
if summary == "newspaper":
@@ -230,7 +230,7 @@ def summarizeLinksToAudio(url, summary) -> None:
results = list()
result = str()
try:
- config = Config()
+ config = newspaper.Config()
configNews(config)
urls = getURLS(url, summary)
for url in urls:
@@ -272,7 +272,38 @@ def getAudioFromFile(audio_path: str) -> str:
return audio.read()
-app = FastAPI()
+def getSentiments() -> list:
+ """Get sentiments"""
+ results = list()
+ SOURCE = "https://github.com/coinpride/CryptoList"
+ urls = simpleGet(SOURCE)
+ classifier = transformers.pipeline("sentiment-analysis")
+ for url in urls:
+ req_result = simpleGet(url)
+ results.append(classifier(req_result))
+ return results
+
+
+app = fastapi.FastAPI()
+
+
+# https://cheatsheetseries.owasp.org/cheatsheets/REST_Security_Cheat_Sheet.html
+@app.middleware("http")
+async def addSecureHeaders(
+ request: fastapi.Request, call_next
+) -> fastapi.Response:
+ """adds security headers proposed by OWASP"""
+ response = await call_next(request)
+ response.headers["Cache-Control"] = "no-store"
+ response.headers["Content-Security-Policy"] = "default-src-https"
+ response.headers["Strict-Transport-Security"] = "max-age=63072000"
+ response.headers["X-Content-Type-Options"] = "nosniff"
+ response.headers["X-Frame-Options"] = "DENY"
+ response.headers["Access-Control-Allow-Methods"] = "GET,OPTIONS"
+ return response
+
+
+app.add_middleware(addSecureHeaders)
nltk.download("punkt")
@@ -280,7 +311,7 @@ nltk.download("punkt")
def pdf_to_audio_ep(url: str):
"""turns a pdf into an audiofile"""
audio_path = pdfToVoice()
- return APIResponse(
+ return fastapi.Response(
getAudioFromFile(audio_path) if audio_path != "" else "",
media_type="audio/mpeg",
)
@@ -303,7 +334,7 @@ def wiki_search_ep(term: str, summary: str = "none", audio: bool = False):
text = searchWikipedia(term, summary)
if audio:
audio_path = textToAudio(text)
- return APIResponse(
+ return fastapi.Response(
getAudioFromFile(audio_path) if audio_path != "" else "",
media_type="audio/mpeg",
)
@@ -323,7 +354,7 @@ def summarize_ep(url: str, summary: str = "none", audio: bool = False):
if audio:
audio_path = textToAudio(text)
print(audio_path)
- return APIResponse(
+ return fastapi.Response(
getAudioFromFile(audio_path) if audio_path != "" else "",
media_type="audio/mpeg",
)
@@ -343,7 +374,7 @@ def mila_ep(url: str, summary: str = "newspaper", audio: bool = False):
if audio:
audio_path = textToAudio(text)
print(audio_path)
- return APIResponse(
+ return fastapi.Response(
getAudioFromFile(audio_path) if audio_path != "" else "",
media_type="audio/mpeg",
)
@@ -356,6 +387,13 @@ def mila_ep(url: str, summary: str = "newspaper", audio: bool = False):
}
+@app.get("/mila/sentiments")
+def sentiments_endpoint(url: str):
+ """the sentiments endpoint"""
+ sentiments = getSentiments()
+ return {"Content-Type": "application/json", "Sentiments": sentiments}
+
+
@app.get("/mila/health")
def health_ep():
return {"isOK": True}
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
index 92e2537..43159f2 100755
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -1,7 +1,21 @@
#!/usr/bin/env sh
if [ "$SERVER_DEPLOYMENT_TYPE" = "deployment" ]; then
- uvicorn devourer:app --host 0.0.0.0 --port 80 --ssl-certfile /certs/server.cert --ssl-keyfile /certs/server.key
+ uvicorn devourer:app \
+ --host 0.0.0.0 \
+ --port 80 \
+ --ssl-certfile /certs/server.cert \
+ --ssl-keyfile /certs/server.key \
+ --no-proxy-headers \
+ --no-server-headers \
+ --no-date-headers
elif [ "$SERVER_DEPLOYMENT_TYPE" = "test" ]; then
- uvicorn devourer:app --host 0.0.0.0 --port 80 --ssl-certfile /certs/server.cert --ssl-keyfile /certs/server.key
+ uvicorn devourer:app \
+ --host 0.0.0.0 \
+ --port 80 \
+ --ssl-certfile /certs/server.cert \
+ --ssl-keyfile /certs/server.key \
+ --no-proxy-headers \
+ --no-server-headers \
+ --no-date-headers
fi
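
A note on the new uvicorn switches: uvicorn's CLI documents the header options in the singular (--no-server-header, --no-date-header; only --no-proxy-headers is plural), so the plural spellings used above may be rejected at startup. The same settings can also be applied programmatically; a minimal sketch, with the module path and certificate locations taken from the script and keyword names assumed from uvicorn's run()/Config API:

# Programmatic equivalent of the entrypoint command (sketch only, not part of the patch).
# Keyword names follow uvicorn's run()/Config API; paths are those used in the script.
import uvicorn

if __name__ == "__main__":
    uvicorn.run(
        "devourer:app",
        host="0.0.0.0",
        port=80,
        ssl_certfile="/certs/server.cert",
        ssl_keyfile="/certs/server.key",
        proxy_headers=False,  # --no-proxy-headers
        server_header=False,  # --no-server-header
        date_header=False,    # --no-date-header
    )
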
diff --git a/pyproject.toml b/pyproject.toml
index 95b9f5d..d0fa291 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,12 +1,12 @@
[tool.poetry]
name = "devourer"
-version = "0.1.0"
+version = "1.0.0"
description = ""
authors = ["terminaldweller <thabogre@gmail.com>"]
license = "GPL3.0"
[tool.poetry.dependencies]
-python = "^3.8"
+python = "3.8"
newspaper3k = "^0.2.8"
beautifulsoup4 = "^4.9.3"
readability-lxml = "^0.8.1"