aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--  Dockerfile                2
-rw-r--r--  devourer.py              48
-rw-r--r--  docker-compose-test.yaml  3
-rw-r--r--  docker-compose.yaml       3
-rw-r--r--  pyproject.toml            2
-rwxr-xr-x  run.sh                    9
-rwxr-xr-x  tests.sh                  7
7 files changed, 50 insertions, 24 deletions
diff --git a/Dockerfile b/Dockerfile
index 9ca872e..f8882fe 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,7 +19,7 @@ WORKDIR $PYSETUP_PATH
COPY ./pyproject.toml ./
RUN poetry install --no-dev
-FROM node:lts-alpine3.13 AS certbuilder
+FROM alpine:3.15 AS certbuilder
RUN apk add openssl
WORKDIR /certs
RUN openssl req -nodes -new -x509 -subj="/C=US/ST=Denial/L=springfield/O=Dis/CN=localhost" -keyout server.key -out server.cert
diff --git a/devourer.py b/devourer.py
index 249500c..995ab52 100644
--- a/devourer.py
+++ b/devourer.py
@@ -1,7 +1,6 @@
# _*_ coding=utf-8 _*_
import bs4
-import concurrent.futures
import contextlib
import datetime
import fastapi
@@ -15,7 +14,9 @@ import re
import readability
import requests
import string
+import tempfile
import tika
+from tika import parser as tparser
import transformers
@@ -138,6 +139,26 @@ def extractRequirements(textBody: str) -> list:
return result
+def pdfToText(url: str) -> str:
+    """Convert the PDF file at ``url`` to a string.
+
+    The bytes returned by ``simpleGet(url)`` are written to a temporary
+    file, and that file is handed to the Tika parser using the server named
+    by the ``TIKA_SERVER_ENDPOINT`` environment variable.
+
+    Args:
+        url: Location of the PDF to convert.
+
+    Returns:
+        The text Tika extracted, or an empty string when the download or
+        the conversion fails or Tika reports no content.
+    """
+    tikaResult = dict()
+    try:
+        with tempfile.NamedTemporaryFile(mode="w+b", delete=True) as tmpFile:
+            tmpFile.write(simpleGet(url))
+            # The parser is only given the file name, so the buffered bytes
+            # must reach the disk before it is invoked.
+            tmpFile.flush()
+            tikaResult = tparser.from_file(
+                tmpFile.name, serverEndpoint=os.environ["TIKA_SERVER_ENDPOINT"]
+            )
+        logging.debug("tika metadata: %s", tikaResult.get("metadata"))
+    except Exception as e:
+        # Best effort: every failure is logged and reported as "no text".
+        logging.exception(e)
+    # Returning here rather than from a ``finally`` clause lets
+    # KeyboardInterrupt/SystemExit propagate; ``or ""`` maps a missing or
+    # ``None`` "content" entry to the promised empty string.
+    return tikaResult.get("content") or ""
+
+
def summarizeText(text: str) -> str:
"""Summarize the given text using bart."""
@@ -307,6 +328,24 @@ async def addSecureHeaders(
nltk.download("punkt")
+@app.get("/mila/pdf")
+def pdf_ep(url: str, feat: str, audio: bool = False, summarize: bool = False):
+    """Extract the text of the PDF at ``url`` and optionally summarize it.
+
+    Args:
+        url: Location of the PDF to convert.
+        feat: Required parameter; not used by this handler yet.
+        audio: Accepted but currently ignored (audio output is not
+            implemented for this endpoint).
+        summarize: When true, pass the extracted text through
+            ``summarizeText`` before returning it.
+
+    Returns:
+        A dict carrying the text under ``"result"`` and ``"isOk"`` set to
+        whether any text was produced.
+    """
+    text = pdfToText(url)
+    # Nothing was extracted: skip the summarizer rather than feed it an
+    # empty document.
+    if summarize and text:
+        text = summarizeText(text)
+    # TODO: return the text as audio when ``audio`` is set.
+    return {
+        "Content-Type": "application/json",
+        # Truthiness also reports a ``None`` result as a failure.
+        "isOk": bool(text),
+        "result": text,
+    }
+
+
@app.get("/mila/tika")
def pdf_to_audio_ep(url: str):
"""turns a pdf into an audiofile"""
@@ -387,13 +426,6 @@ def mila_ep(url: str, summary: str = "newspaper", audio: bool = False):
}
-@app.get("/mila/sentiments")
-def sentiments_endpoint(url: str, detailed: bool):
- """the sentiments endpoint"""
- sentiments = getSentiments(detailed)
- return {"Content-Type": "application/json", "Sentiments": sentiments}
-
-
@app.get("/mila/health")
def health_ep():
return {"Content-Type": "application/json", "isOK": True}
diff --git a/docker-compose-test.yaml b/docker-compose-test.yaml
index 7fe0ea2..3a85a11 100644
--- a/docker-compose-test.yaml
+++ b/docker-compose-test.yaml
@@ -12,10 +12,11 @@ services:
ports:
- "19019:80"
environment:
- - TIKA_SERVER_ENDPOINT=tika:9998
+ - TIKA_SERVER_ENDPOINT=http://tika:9998
- AUDIO_DUMP_DIR=/tmp
- WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php
- SERVER_DEPLOYMENT_TYPE=test
+ - TIKA_CLIENT_ONLY=True
cap_drop:
- ALL
entrypoint: ["/docker-entrypoint.sh"]
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 24d43a7..bbbd0d1 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -12,10 +12,11 @@ services:
ports:
- "9009:80"
environment:
- - TIKA_SERVER_ENDPOINT=tika:9998
+ - TIKA_SERVER_ENDPOINT=http://tika:9998
- AUDIO_DUMP_DIR=/tmp
- WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php
- SERVER_DEPLOYMENT_TYPE=deployment
+ - TIKA_CLIENT_ONLY=True
cap_drop:
- ALL
entrypoint: ["/docker-entrypoint.sh"]
diff --git a/pyproject.toml b/pyproject.toml
index d0fa291..82a7025 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ authors = ["terminaldweller <thabogre@gmail.com>"]
license = "GPL3.0"
[tool.poetry.dependencies]
-python = "3.8"
+python = "^3.8"
newspaper3k = "^0.2.8"
beautifulsoup4 = "^4.9.3"
readability-lxml = "^0.8.1"
diff --git a/run.sh b/run.sh
deleted file mode 100755
index be3cd55..0000000
--- a/run.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env sh
-set -e
-set -x
-
-# sniff --src https://github.com/coinpride/CryptoList --url | ./main.py > out2.txt
-./main.py --source https://github.com/coinpride/CryptoList > out.html
-# ./main.py --source https://techurls.com/ > out.txt
-# cat out2.txt| pico2wave --wave=out2.wav
-# ./main.py --source http://blog.terminaldweller.com > out.txt
diff --git a/tests.sh b/tests.sh
index 0fdd19d..0960127 100755
--- a/tests.sh
+++ b/tests.sh
@@ -1,5 +1,6 @@
#!/usr/bin/env sh
-curl -k -X GET https://localhost:19019/mila/summ?url=https://dilipkumar.medium.com/standalone-mongodb-on-kubernetes-cluster-19e7b5896b27&summary=newspaper&audio=true
-curl -k -X GET https://localhost:19019/mila/wiki?term=iommu&summary=none&audio=false
-curl -k -X GET https://localhost:19019/mila/reqs?url=https://www.ietf.org/rfc/rfc2865.txt&sourcetype=text
+curl -k -X GET "https://localhost:19019/mila/summ?url=https://dilipkumar.medium.com/standalone-mongodb-on-kubernetes-cluster-19e7b5896b27&summary=newspaper&audio=true"
+curl -k -X GET "https://localhost:19019/mila/wiki?term=iommu&summary=none&audio=false"
+curl -k -X GET "https://localhost:19019/mila/reqs?url=https://www.ietf.org/rfc/rfc2865.txt&sourcetype=text"
+curl -k -X GET "https://localhost:19019/mila/pdf?feat=gaga&url=https://www.rroij.com/open-access/mutation-testing-a-review-33-36.pdf"