about summary refs log tree commit diff stats
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- devourer/Dockerfile 3
-rw-r--r-- devourer/devourer.py 73
-rw-r--r-- devourer/poetry.lock 15
-rw-r--r-- docker-compose-test.yaml 3
-rw-r--r-- docker-compose.yaml 7
5 files changed, 75 insertions, 26 deletions
diff --git a/devourer/Dockerfile b/devourer/Dockerfile
index f8882fe..3b4bc94 100644
--- a/devourer/Dockerfile
+++ b/devourer/Dockerfile
@@ -22,9 +22,10 @@ RUN poetry install --no-dev
FROM alpine:3.15 AS certbuilder
RUN apk add openssl
WORKDIR /certs
-RUN openssl req -nodes -new -x509 -subj="/C=US/ST=Denial/L=springfield/O=Dis/CN=localhost" -keyout server.key -out server.cert
+RUN openssl req -nodes -new -x509 -subj="/C=US/ST=Denial/L=springfield/O=Dis/CN=mila.terminaldweller.com" -keyout server.key -out server.cert
FROM python-base as production
+RUN apt update && apt install --no-install-recommends -y poppler-utils
COPY --from=certbuilder /certs/ /certs
ENV FASTAPI_ENV=production
COPY --from=builder-base $VENV_PATH $VENV_PATH
diff --git a/devourer/devourer.py b/devourer/devourer.py
index e754538..31ab87f 100644
--- a/devourer/devourer.py
+++ b/devourer/devourer.py
@@ -99,6 +99,13 @@ def configNews(config: newspaper.Config) -> None:
config.browser_user_agent = "Chrome/91.0.4464.5"
+def sanitizeText(text: str) -> str:
+    """Strip line breaks and double quotes from text.
+
+    Every carriage return, line feed and double-quote character is
+    removed; nothing is inserted in their place, so words that were
+    separated only by a line break end up joined.
+
+    Args:
+        text: the string to clean.
+
+    Returns:
+        The cleaned string.
+    """
+    # CR and LF are removed independently of each other. Removing LF first
+    # and then looking for an LF+CR pair (as was done before) can never
+    # match, which left stray carriage returns in the output.
+    for unwanted in ("\r", "\n", '"'):
+        text = text.replace(unwanted, "")
+    return text
+
+
# FIXME-have to decide whether to use files or urls
def pdfToVoice() -> str:
"""Main function for converting a pdf to an mp3."""
@@ -135,10 +142,23 @@ def extractRequirements(textBody: str) -> list:
for sentence in sentences:
for keyword in REQ_KEYWORDS:
if sentence.casefold().find(keyword) >= 0:
- result.append(sentence)
+ result.append(sanitizeText(sentence))
return result
+def extractRefs(url: str) -> list:
+    """Extract the references of the document found at url.
+
+    Args:
+        url: location of the document handed to refextract.
+
+    Returns:
+        Whatever refextract.extract_references_from_url returns
+        (presumably a list of reference records - confirm against the
+        refextract documentation), or an empty list when extraction
+        raises an Exception. The failure is logged, not re-raised.
+    """
+    # Kept as a function-local import, as before.
+    import refextract
+
+    try:
+        return refextract.extract_references_from_url(url)
+    except Exception as e:
+        # Best effort: log and fall through to the empty result below.
+        logging.exception(e)
+    # No "return" inside a "finally" here: that would also swallow
+    # KeyboardInterrupt/SystemExit raised during extraction.
+    return []
+
+
def pdfToText(url: str) -> str:
"""Convert the PDF file to a string"""
tikaResult = dict()
@@ -148,13 +168,13 @@ def pdfToText(url: str) -> str:
tikaResult = tparser.from_file(
tmpFile.name, serverEndpoint=os.environ["TIKA_SERVER_ENDPOINT"]
)
- print(tikaResult["metadata"])
- print(tikaResult["content"])
+ # print(tikaResult["metadata"])
+ # print(tikaResult["content"])
except Exception as e:
logging.exception(e)
finally:
if "content" in tikaResult:
- return tikaResult["content"]
+ return sanitizeText(tikaResult["content"])
else:
return ""
@@ -245,6 +265,8 @@ def summarizeLinkToAudio(url, summary) -> str:
result = article.text
else:
print("invalid option for summary type.")
+ if result != "":
+ result = sanitizeText(result)
except Exception as e:
logging.exception(e)
finally:
@@ -287,6 +309,7 @@ def searchWikipedia(search_term: str, summary: str) -> str:
# FIXME-handle wiki redirects/disambiguations
source = res[3][0]
result = summarizeLinkToAudio(source, summary)
+ result = sanitizeText(result)
except Exception as e:
logging.exception(e)
finally:
@@ -331,25 +354,35 @@ async def addSecureHeaders(
nltk.download("punkt")
-transformers_summarizer = transformers.pipeline("summarization")
+# transformers_summarizer = transformers.pipeline("summarization")
@app.get("/mila/pdf")
-def pdf_ep(url: str, feat: str, audio: bool = False, summarize: bool = False):
- text = pdfToText(url)
- if summarize:
- text = summarizeText(text)
- # if audio:
- # audio_path = textToAudio(text)
- # return fastapi.Response(
- # getAudioFromFile(audio_path) if audio_path != "" else "",
- # media_type="audio/mpeg",
- # )
- return {
- "Content-Type": "application/json",
- "isOk": True if text != "" else False,
- "result": text,
- }
+def pdf_ep(
+    url: str, feat: str = "", audio: bool = False, summarize: bool = False
+):
+    """Handle a /mila/pdf request for the PDF at url.
+
+    feat selects what is produced:
+      ""     - the text returned by pdfToText, passed through
+               summarizeText when summarize is set. When audio is set
+               the text is turned into audio with textToAudio and sent
+               back as an audio/mpeg response instead of JSON.
+      "refs" - the references returned by extractRefs.
+    Any other value yields a JSON body with isOk set to False rather
+    than an empty (null) response.
+
+    Returns:
+        A fastapi.Response carrying audio, or a dict with the keys
+        "Content-Type", "isOk" and "result".
+    """
+    if feat == "refs":
+        refs = extractRefs(url)
+        return {
+            "Content-Type": "application/json",
+            # extractRefs hands back an empty list on failure, so test
+            # for emptiness; an "is not None" check is always true.
+            "isOk": bool(refs),
+            "result": refs,
+        }
+
+    if feat != "":
+        # Unknown feature: answer explicitly instead of returning None.
+        return {
+            "Content-Type": "application/json",
+            "isOk": False,
+            "result": "unsupported value for feat.",
+        }
+
+    text = pdfToText(url)
+    if summarize:
+        text = summarizeText(text)
+    if audio:
+        audio_path = textToAudio(text)
+        return fastapi.Response(
+            getAudioFromFile(audio_path) if audio_path != "" else "",
+            media_type="audio/mpeg",
+        )
+    return {
+        "Content-Type": "application/json",
+        "isOk": text != "",
+        "result": text,
+    }
@app.get("/mila/tika")
diff --git a/devourer/poetry.lock b/devourer/poetry.lock
index 55df0df..ce2a8e6 100644
--- a/devourer/poetry.lock
+++ b/devourer/poetry.lock
@@ -663,6 +663,14 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[[package]]
+name = "python-magic-bin"
+version = "0.4.14"
+description = "File type identification using libmagic binary package"
+category = "main"
+optional = false
+python-versions = "*"
+
+[[package]]
name = "pytz"
version = "2022.1"
description = "World timezone definitions, modern and historical"
@@ -1149,7 +1157,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
-content-hash = "f7a64a6a1da08676ad0effa163082f02ff87d65ae0e3f4a3f6a2e87607ee1cec"
+content-hash = "a740bd5805861994b28f7a187d06be052f26dd920355f6624955134e62cb6059"
[metadata.files]
absl-py = [
@@ -1634,6 +1642,11 @@ python-magic = [
{file = "python-magic-0.4.25.tar.gz", hash = "sha256:21f5f542aa0330f5c8a64442528542f6215c8e18d2466b399b0d9d39356d83fc"},
{file = "python_magic-0.4.25-py2.py3-none-any.whl", hash = "sha256:1a2c81e8f395c744536369790bd75094665e9644110a6623bcc3bbea30f03973"},
]
+python-magic-bin = [
+ {file = "python_magic_bin-0.4.14-py2.py3-none-macosx_10_6_intel.whl", hash = "sha256:7b1743b3dbf16601d6eedf4e7c2c9a637901b0faaf24ad4df4d4527e7d8f66a4"},
+ {file = "python_magic_bin-0.4.14-py2.py3-none-win32.whl", hash = "sha256:34a788c03adde7608028203e2dbb208f1f62225ad91518787ae26d603ae68892"},
+ {file = "python_magic_bin-0.4.14-py2.py3-none-win_amd64.whl", hash = "sha256:90be6206ad31071a36065a2fc169c5afb5e0355cbe6030e87641c6c62edc2b69"},
+]
pytz = [
{file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"},
{file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"},
diff --git a/docker-compose-test.yaml b/docker-compose-test.yaml
index 102ce34..0f743b7 100644
--- a/docker-compose-test.yaml
+++ b/docker-compose-test.yaml
@@ -21,6 +21,8 @@ services:
cap_drop:
- ALL
entrypoint: ["/docker-entrypoint.sh"]
+ volumes:
+ - devourer-nltk-data:/root/nltk_data/
tika:
image: apache/tika:2.0.0
networks:
@@ -50,3 +52,4 @@ networks:
tikanet:
volumes:
cargo-vault:
+ devourer-nltk-data:
diff --git a/docker-compose.yaml b/docker-compose.yaml
index b7eb5f6..b8e16e5 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -21,6 +21,8 @@ services:
cap_drop:
- ALL
entrypoint: ["/docker-entrypoint.sh"]
+ volumes:
+ - devourer-nltk-data:/root/nltk_data/
tika:
image: apache/tika:2.0.0
networks:
@@ -41,10 +43,6 @@ services:
cap_drop:
- ALL
cap_add:
- - CHOWN
- - DAC_OVERRIDE
- - SETGID
- - SETUID
- NET_BIND_SERVICE
entrypoint: ["/cargo/cargo.py"]
networks:
@@ -52,3 +50,4 @@ networks:
tikanet:
volumes:
cargo-vault:
+ devourer-nltk-data: