From f8f4c9ced849e16cac82a5a3f7b4593e99a35c08 Mon Sep 17 00:00:00 2001 From: terminaldweller Date: Mon, 16 May 2022 21:48:53 +0430 Subject: update --- Dockerfile | 2 +- devourer.py | 48 ++++++++++++++++++++++++++++++++++++++++-------- docker-compose-test.yaml | 3 ++- docker-compose.yaml | 3 ++- pyproject.toml | 2 +- run.sh | 9 --------- tests.sh | 7 ++++--- 7 files changed, 50 insertions(+), 24 deletions(-) delete mode 100755 run.sh diff --git a/Dockerfile b/Dockerfile index 9ca872e..f8882fe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,7 +19,7 @@ WORKDIR $PYSETUP_PATH COPY ./pyproject.toml ./ RUN poetry install --no-dev -FROM node:lts-alpine3.13 AS certbuilder +FROM alpine:3.15 AS certbuilder RUN apk add openssl WORKDIR /certs RUN openssl req -nodes -new -x509 -subj="/C=US/ST=Denial/L=springfield/O=Dis/CN=localhost" -keyout server.key -out server.cert diff --git a/devourer.py b/devourer.py index 249500c..995ab52 100644 --- a/devourer.py +++ b/devourer.py @@ -1,7 +1,6 @@ # _*_ coding=utf-8 _*_ import bs4 -import concurrent.futures import contextlib import datetime import fastapi @@ -15,7 +14,9 @@ import re import readability import requests import string +import tempfile import tika +from tika import parser as tparser import transformers @@ -138,6 +139,26 @@ def extractRequirements(textBody: str) -> list: return result +def pdfToText(url: str) -> str: + """Convert the PDF file to a string""" + tikaResult = dict() + try: + with tempfile.NamedTemporaryFile(mode="w+b", delete=True) as tmpFile: + tmpFile.write(simpleGet(url)) + tikaResult = tparser.from_file( + tmpFile.name, serverEndpoint=os.environ["TIKA_SERVER_ENDPOINT"] + ) + print(tikaResult["metadata"]) + print(tikaResult["content"]) + except Exception as e: + logging.exception(e) + finally: + if "content" in tikaResult: + return tikaResult["content"] + else: + return "" + + def summarizeText(text: str) -> str: """Summarize the given text using bart.""" @@ -307,6 +328,24 @@ async def addSecureHeaders( nltk.download("punkt") +@app.get("/mila/pdf") +def pdf_ep(url: str, feat: str, audio: bool = False, summarize: bool = False): + text = pdfToText(url) + if summarize: + text = summarizeText(text) + # if audio: + # audio_path = textToAudio(text) + # return fastapi.Response( + # getAudioFromFile(audio_path) if audio_path != "" else "", + # media_type="audio/mpeg", + # ) + return { + "Content-Type": "application/json", + "isOk": True if text != "" else False, + "result": text, + } + + @app.get("/mila/tika") def pdf_to_audio_ep(url: str): """turns a pdf into an audiofile""" @@ -387,13 +426,6 @@ def mila_ep(url: str, summary: str = "newspaper", audio: bool = False): } -@app.get("/mila/sentiments") -def sentiments_endpoint(url: str, detailed: bool): - """the sentiments endpoint""" - sentiments = getSentiments(detailed) - return {"Content-Type": "application/json", "Sentiments": sentiments} - - @app.get("/mila/health") def health_ep(): return {"Content-Type": "application/json", "isOK": True} diff --git a/docker-compose-test.yaml b/docker-compose-test.yaml index 7fe0ea2..3a85a11 100644 --- a/docker-compose-test.yaml +++ b/docker-compose-test.yaml @@ -12,10 +12,11 @@ services: ports: - "19019:80" environment: - - TIKA_SERVER_ENDPOINT=tika:9998 + - TIKA_SERVER_ENDPOINT=http://tika:9998 - AUDIO_DUMP_DIR=/tmp - WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php - SERVER_DEPLOYMENT_TYPE=test + - TIKA_CLIENT_ONLY=True cap_drop: - ALL entrypoint: ["/docker-entrypoint.sh"] diff --git a/docker-compose.yaml b/docker-compose.yaml index 24d43a7..bbbd0d1 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -12,10 +12,11 @@ services: ports: - "9009:80" environment: - - TIKA_SERVER_ENDPOINT=tika:9998 + - TIKA_SERVER_ENDPOINT=http://tika:9998 - AUDIO_DUMP_DIR=/tmp - WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php - SERVER_DEPLOYMENT_TYPE=deployment + - TIKA_CLIENT_ONLY=True cap_drop: - ALL entrypoint: ["/docker-entrypoint.sh"] diff --git a/pyproject.toml b/pyproject.toml index d0fa291..82a7025 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ authors = ["terminaldweller "] license = "GPL3.0" [tool.poetry.dependencies] -python = "3.8" +python = "^3.8" newspaper3k = "^0.2.8" beautifulsoup4 = "^4.9.3" readability-lxml = "^0.8.1" diff --git a/run.sh b/run.sh deleted file mode 100755 index be3cd55..0000000 --- a/run.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env sh -set -e -set -x - -# sniff --src https://github.com/coinpride/CryptoList --url | ./main.py > out2.txt -./main.py --source https://github.com/coinpride/CryptoList > out.html -# ./main.py --source https://techurls.com/ > out.txt -# cat out2.txt| pico2wave --wave=out2.wav -# ./main.py --source http://blog.terminaldweller.com > out.txt diff --git a/tests.sh b/tests.sh index 0fdd19d..0960127 100755 --- a/tests.sh +++ b/tests.sh @@ -1,5 +1,6 @@ #!/usr/bin/env sh -curl -k -X GET https://localhost:19019/mila/summ?url=https://dilipkumar.medium.com/standalone-mongodb-on-kubernetes-cluster-19e7b5896b27&summary=newspaper&audio=true -curl -k -X GET https://localhost:19019/mila/wiki?term=iommu&summary=none&audio=false -curl -k -X GET https://localhost:19019/mila/reqs?url=https://www.ietf.org/rfc/rfc2865.txt&sourcetype=text +curl -k -X GET "https://localhost:19019/mila/summ?url=https://dilipkumar.medium.com/standalone-mongodb-on-kubernetes-cluster-19e7b5896b27&summary=newspaper&audio=true" +curl -k -X GET "https://localhost:19019/mila/wiki?term=iommu&summary=none&audio=false" +curl -k -X GET "https://localhost:19019/mila/reqs?url=https://www.ietf.org/rfc/rfc2865.txt&sourcetype=text" +curl -k -X GET "https://localhost:19019/mila/pdf?feat=gaga&url=https://www.rroij.com/open-access/mutation-testing-a-review-33-36.pdf" -- cgit v1.2.3