diff options
Diffstat (limited to 'devourer.py')
-rw-r--r-- | devourer.py | 48 |
1 files changed, 40 insertions, 8 deletions
diff --git a/devourer.py b/devourer.py
index 249500c..995ab52 100644
--- a/devourer.py
+++ b/devourer.py
@@ -1,7 +1,6 @@
 # _*_ coding=utf-8 _*_
 
 import bs4
-import concurrent.futures
 import contextlib
 import datetime
 import fastapi
@@ -15,7 +14,9 @@ import re
 import readability
 import requests
 import string
+import tempfile
 import tika
+from tika import parser as tparser
 import transformers
 
 
@@ -138,6 +139,26 @@ def extractRequirements(textBody: str) -> list:
     return result
 
 
+def pdfToText(url: str) -> str:
+    """Convert the PDF file to a string"""
+    tikaResult = dict()
+    try:
+        with tempfile.NamedTemporaryFile(mode="w+b", delete=True) as tmpFile:
+            tmpFile.write(simpleGet(url))
+            tikaResult = tparser.from_file(
+                tmpFile.name, serverEndpoint=os.environ["TIKA_SERVER_ENDPOINT"]
+            )
+            print(tikaResult["metadata"])
+            print(tikaResult["content"])
+    except Exception as e:
+        logging.exception(e)
+    finally:
+        if "content" in tikaResult:
+            return tikaResult["content"]
+        else:
+            return ""
+
+
 def summarizeText(text: str) -> str:
     """Summarize the given text using bart."""
 
@@ -307,6 +328,24 @@ async def addSecureHeaders(
 nltk.download("punkt")
 
 
+@app.get("/mila/pdf")
+def pdf_ep(url: str, feat: str, audio: bool = False, summarize: bool = False):
+    text = pdfToText(url)
+    if summarize:
+        text = summarizeText(text)
+    # if audio:
+    #     audio_path = textToAudio(text)
+    #     return fastapi.Response(
+    #         getAudioFromFile(audio_path) if audio_path != "" else "",
+    #         media_type="audio/mpeg",
+    #     )
+    return {
+        "Content-Type": "application/json",
+        "isOk": True if text != "" else False,
+        "result": text,
+    }
+
+
 @app.get("/mila/tika")
 def pdf_to_audio_ep(url: str):
     """turns a pdf into an audiofile"""
@@ -387,13 +426,6 @@ def mila_ep(url: str, summary: str = "newspaper", audio: bool = False):
     }
 
 
-@app.get("/mila/sentiments")
-def sentiments_endpoint(url: str, detailed: bool):
-    """the sentiments endpoint"""
-    sentiments = getSentiments(detailed)
-    return {"Content-Type": "application/json", "Sentiments": sentiments}
-
-
 @app.get("/mila/health")
 def health_ep():
     return {"Content-Type": "application/json", "isOK": True}