From 2535f836d49f1db32b4413e459c7acdefe97a434 Mon Sep 17 00:00:00 2001 From: terminaldweller Date: Wed, 25 May 2022 23:23:09 +0430 Subject: bug fixes and updates --- devourer/Dockerfile | 3 +- devourer/devourer.py | 73 +++++++++++++++++++++++++++++++++++------------- devourer/poetry.lock | 15 +++++++++- docker-compose-test.yaml | 3 ++ docker-compose.yaml | 7 ++--- 5 files changed, 75 insertions(+), 26 deletions(-) diff --git a/devourer/Dockerfile b/devourer/Dockerfile index f8882fe..3b4bc94 100644 --- a/devourer/Dockerfile +++ b/devourer/Dockerfile @@ -22,9 +22,10 @@ RUN poetry install --no-dev FROM alpine:3.15 AS certbuilder RUN apk add openssl WORKDIR /certs -RUN openssl req -nodes -new -x509 -subj="/C=US/ST=Denial/L=springfield/O=Dis/CN=localhost" -keyout server.key -out server.cert +RUN openssl req -nodes -new -x509 -subj="/C=US/ST=Denial/L=springfield/O=Dis/CN=mila.terminaldweller.com" -keyout server.key -out server.cert FROM python-base as production +RUN apt update && apt install --no-install-recommends -y poppler-utils COPY --from=certbuilder /certs/ /certs ENV FASTAPI_ENV=production COPY --from=builder-base $VENV_PATH $VENV_PATH diff --git a/devourer/devourer.py b/devourer/devourer.py index e754538..31ab87f 100644 --- a/devourer/devourer.py +++ b/devourer/devourer.py @@ -99,6 +99,13 @@ def configNews(config: newspaper.Config) -> None: config.browser_user_agent = "Chrome/91.0.4464.5" +def sanitizeText(text: str) -> str: + text = text.replace("\n", "") + text = text.replace("\n\r", "") + text = text.replace('"', "") + return text + + # FIXME-have to decide whether to use files or urls def pdfToVoice() -> str: """Main function for converting a pdf to an mp3.""" @@ -135,10 +142,23 @@ def extractRequirements(textBody: str) -> list: for sentence in sentences: for keyword in REQ_KEYWORDS: if sentence.casefold().find(keyword) >= 0: - result.append(sentence) + result.append(sanitizeText(sentence)) return result +def extractRefs(url: str) -> list: + import refextract + + refs = list() + try: + refs = refextract.extract_references_from_url(url) + return refs + except Exception as e: + logging.exception(e) + finally: + return refs + + def pdfToText(url: str) -> str: """Convert the PDF file to a string""" tikaResult = dict() @@ -148,13 +168,13 @@ def pdfToText(url: str) -> str: tikaResult = tparser.from_file( tmpFile.name, serverEndpoint=os.environ["TIKA_SERVER_ENDPOINT"] ) - print(tikaResult["metadata"]) - print(tikaResult["content"]) + # print(tikaResult["metadata"]) + # print(tikaResult["content"]) except Exception as e: logging.exception(e) finally: if "content" in tikaResult: - return tikaResult["content"] + return sanitizeText(tikaResult["content"]) else: return "" @@ -245,6 +265,8 @@ def summarizeLinkToAudio(url, summary) -> str: result = article.text else: print("invalid option for summary type.") + if result != "": + result = sanitizeText(result) except Exception as e: logging.exception(e) finally: @@ -287,6 +309,7 @@ def searchWikipedia(search_term: str, summary: str) -> str: # FIXME-handle wiki redirects/disambiguations source = res[3][0] result = summarizeLinkToAudio(source, summary) + result = sanitizeText(result) except Exception as e: logging.exception(e) finally: @@ -331,25 +354,35 @@ async def addSecureHeaders( nltk.download("punkt") -transformers_summarizer = transformers.pipeline("summarization") +# transformers_summarizer = transformers.pipeline("summarization") @app.get("/mila/pdf") -def pdf_ep(url: str, feat: str, audio: bool = False, summarize: bool = False): - text = pdfToText(url) - if summarize: - text = summarizeText(text) - # if audio: - # audio_path = textToAudio(text) - # return fastapi.Response( - # getAudioFromFile(audio_path) if audio_path != "" else "", - # media_type="audio/mpeg", - # ) - return { - "Content-Type": "application/json", - "isOk": True if text != "" else False, - "result": text, - } +def pdf_ep( + url: str, feat: str = "", audio: bool = False, summarize: bool = False +): + if feat == "": + text = pdfToText(url) + if summarize: + text = summarizeText(text) + if audio: + audio_path = textToAudio(text) + return fastapi.Response( + getAudioFromFile(audio_path) if audio_path != "" else "", + media_type="audio/mpeg", + ) + return { + "Content-Type": "application/json", + "isOk": True if text != "" else False, + "result": text, + } + elif feat == "refs": + refs = extractRefs(url) + return { + "Content-Type": "application/json", + "isOk": True if refs is not None else False, + "result": refs, + } @app.get("/mila/tika") diff --git a/devourer/poetry.lock b/devourer/poetry.lock index 55df0df..ce2a8e6 100644 --- a/devourer/poetry.lock +++ b/devourer/poetry.lock @@ -662,6 +662,14 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "python-magic-bin" +version = "0.4.14" +description = "File type identification using libmagic binary package" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "pytz" version = "2022.1" @@ -1149,7 +1157,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest- [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "f7a64a6a1da08676ad0effa163082f02ff87d65ae0e3f4a3f6a2e87607ee1cec" +content-hash = "a740bd5805861994b28f7a187d06be052f26dd920355f6624955134e62cb6059" [metadata.files] absl-py = [ @@ -1634,6 +1642,11 @@ python-magic = [ {file = "python-magic-0.4.25.tar.gz", hash = "sha256:21f5f542aa0330f5c8a64442528542f6215c8e18d2466b399b0d9d39356d83fc"}, {file = "python_magic-0.4.25-py2.py3-none-any.whl", hash = "sha256:1a2c81e8f395c744536369790bd75094665e9644110a6623bcc3bbea30f03973"}, ] +python-magic-bin = [ + {file = "python_magic_bin-0.4.14-py2.py3-none-macosx_10_6_intel.whl", hash = "sha256:7b1743b3dbf16601d6eedf4e7c2c9a637901b0faaf24ad4df4d4527e7d8f66a4"}, + {file = "python_magic_bin-0.4.14-py2.py3-none-win32.whl", hash = "sha256:34a788c03adde7608028203e2dbb208f1f62225ad91518787ae26d603ae68892"}, + {file = "python_magic_bin-0.4.14-py2.py3-none-win_amd64.whl", hash = "sha256:90be6206ad31071a36065a2fc169c5afb5e0355cbe6030e87641c6c62edc2b69"}, +] pytz = [ {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"}, {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, diff --git a/docker-compose-test.yaml b/docker-compose-test.yaml index 102ce34..0f743b7 100644 --- a/docker-compose-test.yaml +++ b/docker-compose-test.yaml @@ -21,6 +21,8 @@ services: cap_drop: - ALL entrypoint: ["/docker-entrypoint.sh"] + volumes: + - devourer-nltk-data:/root/nltk_data/ tika: image: apache/tika:2.0.0 networks: @@ -50,3 +52,4 @@ networks: tikanet: volumes: cargo-vault: + devourer-nltk-data: diff --git a/docker-compose.yaml b/docker-compose.yaml index b7eb5f6..b8e16e5 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -21,6 +21,8 @@ services: cap_drop: - ALL entrypoint: ["/docker-entrypoint.sh"] + volumes: + - devourer-nltk-data:/root/nltk_data/ tika: image: apache/tika:2.0.0 networks: @@ -41,10 +43,6 @@ services: cap_drop: - ALL cap_add: - - CHOWN - - DAC_OVERRIDE - - SETGID - - SETUID - NET_BIND_SERVICE entrypoint: ["/cargo/cargo.py"] networks: @@ -52,3 +50,4 @@ networks: tikanet: volumes: cargo-vault: + devourer-nltk-data: -- cgit v1.2.3