diff options
Diffstat (limited to '')
| -rw-r--r-- | Dockerfile | 2 | ||||
| -rw-r--r-- | devourer.py | 48 | ||||
| -rw-r--r-- | docker-compose-test.yaml | 3 | ||||
| -rw-r--r-- | docker-compose.yaml | 3 | ||||
| -rw-r--r-- | pyproject.toml | 2 | ||||
| -rwxr-xr-x | run.sh | 9 | ||||
| -rwxr-xr-x | tests.sh | 7 | 
7 files changed, 50 insertions, 24 deletions
| @@ -19,7 +19,7 @@ WORKDIR $PYSETUP_PATH  COPY ./pyproject.toml ./  RUN poetry install --no-dev -FROM node:lts-alpine3.13 AS certbuilder +FROM alpine:3.15 AS certbuilder  RUN apk add openssl  WORKDIR /certs  RUN openssl req -nodes -new -x509 -subj="/C=US/ST=Denial/L=springfield/O=Dis/CN=localhost" -keyout server.key -out server.cert diff --git a/devourer.py b/devourer.py index 249500c..995ab52 100644 --- a/devourer.py +++ b/devourer.py @@ -1,7 +1,6 @@  # _*_ coding=utf-8 _*_  import bs4 -import concurrent.futures  import contextlib  import datetime  import fastapi @@ -15,7 +14,9 @@ import re  import readability  import requests  import string +import tempfile  import tika +from tika import parser as tparser  import transformers @@ -138,6 +139,26 @@ def extractRequirements(textBody: str) -> list:      return result +def pdfToText(url: str) -> str: +    """Convert the PDF file to a string""" +    tikaResult = dict() +    try: +        with tempfile.NamedTemporaryFile(mode="w+b", delete=True) as tmpFile: +            tmpFile.write(simpleGet(url)) +            tikaResult = tparser.from_file( +                tmpFile.name, serverEndpoint=os.environ["TIKA_SERVER_ENDPOINT"] +            ) +            print(tikaResult["metadata"]) +            print(tikaResult["content"]) +    except Exception as e: +        logging.exception(e) +    finally: +        if "content" in tikaResult: +            return tikaResult["content"] +        else: +            return "" + +  def summarizeText(text: str) -> str:      """Summarize the given text using bart.""" @@ -307,6 +328,24 @@ async def addSecureHeaders(  nltk.download("punkt") +@app.get("/mila/pdf") +def pdf_ep(url: str, feat: str, audio: bool = False, summarize: bool = False): +    text = pdfToText(url) +    if summarize: +        text = summarizeText(text) +    # if audio: +    #     audio_path = textToAudio(text) +    # return fastapi.Response( +    #     getAudioFromFile(audio_path) if audio_path != "" else "", +    #     media_type="audio/mpeg", +    # ) +    return { +        "Content-Type": "application/json", +        "isOk": True if text != "" else False, +        "result": text, +    } + +  @app.get("/mila/tika")  def pdf_to_audio_ep(url: str):      """turns a pdf into an audiofile""" @@ -387,13 +426,6 @@ def mila_ep(url: str, summary: str = "newspaper", audio: bool = False):          } -@app.get("/mila/sentiments") -def sentiments_endpoint(url: str, detailed: bool): -    """the sentiments endpoint""" -    sentiments = getSentiments(detailed) -    return {"Content-Type": "application/json", "Sentiments": sentiments} - -  @app.get("/mila/health")  def health_ep():      return {"Content-Type": "application/json", "isOK": True} diff --git a/docker-compose-test.yaml b/docker-compose-test.yaml index 7fe0ea2..3a85a11 100644 --- a/docker-compose-test.yaml +++ b/docker-compose-test.yaml @@ -12,10 +12,11 @@ services:      ports:        - "19019:80"      environment: -      - TIKA_SERVER_ENDPOINT=tika:9998 +      - TIKA_SERVER_ENDPOINT=http://tika:9998        - AUDIO_DUMP_DIR=/tmp        - WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php        - SERVER_DEPLOYMENT_TYPE=test +      - TIKA_CLIENT_ONLY=True      cap_drop:        - ALL      entrypoint: ["/docker-entrypoint.sh"] diff --git a/docker-compose.yaml b/docker-compose.yaml index 24d43a7..bbbd0d1 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -12,10 +12,11 @@ services:      ports:        - "9009:80"      environment: -      - TIKA_SERVER_ENDPOINT=tika:9998 +      - TIKA_SERVER_ENDPOINT=http://tika:9998        - AUDIO_DUMP_DIR=/tmp        - WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php        - SERVER_DEPLOYMENT_TYPE=deployment +      - TIKA_CLIENT_ONLY=True      cap_drop:        - ALL      entrypoint: ["/docker-entrypoint.sh"] diff --git a/pyproject.toml b/pyproject.toml index d0fa291..82a7025 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ authors = ["terminaldweller <thabogre@gmail.com>"]  license = "GPL3.0"  [tool.poetry.dependencies] -python = "3.8" +python = "^3.8"  newspaper3k = "^0.2.8"  beautifulsoup4 = "^4.9.3"  readability-lxml = "^0.8.1" @@ -1,9 +0,0 @@ -#!/usr/bin/env sh -set -e -set -x - -# sniff --src https://github.com/coinpride/CryptoList --url | ./main.py > out2.txt -./main.py --source https://github.com/coinpride/CryptoList > out.html -# ./main.py --source https://techurls.com/ > out.txt -# cat out2.txt| pico2wave --wave=out2.wav -# ./main.py --source http://blog.terminaldweller.com > out.txt @@ -1,5 +1,6 @@  #!/usr/bin/env sh -curl -k -X GET https://localhost:19019/mila/summ?url=https://dilipkumar.medium.com/standalone-mongodb-on-kubernetes-cluster-19e7b5896b27&summary=newspaper&audio=true -curl -k -X GET https://localhost:19019/mila/wiki?term=iommu&summary=none&audio=false -curl -k -X GET https://localhost:19019/mila/reqs?url=https://www.ietf.org/rfc/rfc2865.txt&sourcetype=text +curl -k -X GET "https://localhost:19019/mila/summ?url=https://dilipkumar.medium.com/standalone-mongodb-on-kubernetes-cluster-19e7b5896b27&summary=newspaper&audio=true" +curl -k -X GET "https://localhost:19019/mila/wiki?term=iommu&summary=none&audio=false" +curl -k -X GET "https://localhost:19019/mila/reqs?url=https://www.ietf.org/rfc/rfc2865.txt&sourcetype=text" +curl -k -X GET "https://localhost:19019/mila/pdf?feat=gaga&url=https://www.rroij.com/open-access/mutation-testing-a-review-33-36.pdf" | 
