From cc075d6cc859e427ecbffc1e454d70b34da2cdfb Mon Sep 17 00:00:00 2001
From: terminaldweller
Date: Fri, 27 May 2022 23:35:01 +0430
Subject: wip

---
 cargo/cargo.py           |  2 +-
 devourer/devourer.py     | 56 ++++++++++++++++++++++++++++++++----------------
 docker-compose-test.yaml |  7 ++++--
 docker-compose.yaml      |  3 +++
 4 files changed, 46 insertions(+), 22 deletions(-)

diff --git a/cargo/cargo.py b/cargo/cargo.py
index a819a30..821e235 100755
--- a/cargo/cargo.py
+++ b/cargo/cargo.py
@@ -8,7 +8,7 @@ import os
 # https://huggingface.co/docs/huggingface_hub/how-to-downstream
 def download(path: str = ".") -> None:
     bart_pretrained = hh.hf_hub_url(
-        "lysandre/arxiv-nlp", filename="config.json"
+        "sshleifer/distilbart-cnn-12-6", filename="config.json"
     )
     hh.cached_download(bart_pretrained)
 
diff --git a/devourer/devourer.py b/devourer/devourer.py
index f6b0520..29c63da 100644
--- a/devourer/devourer.py
+++ b/devourer/devourer.py
@@ -181,23 +181,43 @@ def pdfToText(url: str) -> str:
 # FIXME doesnt work for long texts
 def summarizeText(text: str) -> str:
     """Summarize the given text using bart."""
+    result = ""
+    # TODO move me later
+    transformers_summarizer = transformers.pipeline("summarization")
+    try:
+        sentences = text.split(".")
+        current_chunk = 0
+        max_chunk = 500
+        chunks = []
+
+        for sentence in sentences:
+            if len(chunks) == current_chunk + 1:
+                if (
+                    len(chunks[current_chunk]) + len(sentence.split(" "))
+                    <= max_chunk
+                ):
+                    chunks[current_chunk].extend(sentence.split(" "))
+                else:
+                    current_chunk += 1
+                    chunks.append(sentence.split(" "))
+            else:
+                chunks.append(sentence.split(" "))
+        print(chunks)
 
-    model = transformers.BartForConditionalGeneration.from_pretrained(
-        "facebook/bart-large-cnn"
-    )
-    tokenizer = transformers.BartTokenizer.from_pretrained(
-        "facebook/bart-large-cnn"
-    )
-    inputs = tokenizer([text], max_length=1024, return_tensors="pt")
-    summary_ids = model.generate(
-        inputs["input_ids"], num_beams=4, max_length=5, early_stopping=True
-    )
-    return [
-        tokenizer.decode(
-            g, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        for chunk_id in range(len(chunks)):
+            chunks[chunk_id] = " ".join(chunks[chunk_id])
+        print(chunks)
+
+        summaries = transformers_summarizer(
+            chunks, max_length=50, min_length=30, do_sample=False
         )
-        for g in summary_ids
-    ]
+
+        result = " ".join([summary["summary_text"] for summary in summaries])
+        print(result)
+    except Exception as e:
+        logging.exception(e)
+    finally:
+        return result
 
 
 def summarizeText_v2(text: str) -> str:
@@ -335,6 +355,8 @@ def getSentiments(detailed: bool) -> list:
 
 app = fastapi.FastAPI()
 
+nltk.download("punkt")
+
 
 # https://cheatsheetseries.owasp.org/cheatsheets/REST_Security_Cheat_Sheet.html
 @app.middleware("http")
@@ -352,10 +374,6 @@ async def addSecureHeaders(
     return response
 
 
-nltk.download("punkt")
-# transformers_summarizer = transformers.pipeline("summarization")
-
-
 @app.get("/mila/pdf")
 def pdf_ep(
     url: str, feat: str = "", audio: bool = False, summarize: bool = False
diff --git a/docker-compose-test.yaml b/docker-compose-test.yaml
index 0f743b7..1545132 100644
--- a/docker-compose-test.yaml
+++ b/docker-compose-test.yaml
@@ -18,11 +18,13 @@ services:
       - WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php
       - SERVER_DEPLOYMENT_TYPE=test
       - TIKA_CLIENT_ONLY=True
+      - TRANSFORMERS_CACHE=/huggingface/cache/
     cap_drop:
       - ALL
     entrypoint: ["/docker-entrypoint.sh"]
     volumes:
       - devourer-nltk-data:/root/nltk_data/
+      - devourer-hg-cache:/huggingface/cache/
   tika:
     image: apache/tika:2.0.0
     networks:
@@ -39,9 +41,9 @@ services:
       - "127.0.0.1:8080:8080"
     environment:
       - SERVER_PORT=8080
-      - SERVER_VAULT=/cargo-vault
+      - SERVER_VAULT=/cargo-vault/
     volumes:
-      - cargo-vault:/cargo-vault
+      - cargo-vault:/cargo-vault/
     cap_drop:
       - ALL
     cap_add:
@@ -53,3 +55,4 @@ networks:
 volumes:
   cargo-vault:
   devourer-nltk-data:
+  devourer-hg-cache:
diff --git a/docker-compose.yaml b/docker-compose.yaml
index b8e16e5..fd29695 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -18,11 +18,13 @@ services:
       - WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php
       - SERVER_DEPLOYMENT_TYPE=deployment
       - TIKA_CLIENT_ONLY=True
+      - TRANSFORMERS_CACHE=/huggingface/cache/
     cap_drop:
       - ALL
     entrypoint: ["/docker-entrypoint.sh"]
     volumes:
       - devourer-nltk-data:/root/nltk_data/
+      - devourer-hg-cache:/huggingface/cache/
   tika:
     image: apache/tika:2.0.0
     networks:
@@ -51,3 +53,4 @@ networks:
 volumes:
   cargo-vault:
   devourer-nltk-data:
+  devourer-hg-cache:
-- 
cgit v1.2.3