-rwxr-xr-x  cargo/cargo.py            |  2
-rw-r--r--  devourer/devourer.py      | 56
-rw-r--r--  docker-compose-test.yaml  |  7
-rw-r--r--  docker-compose.yaml       |  3
4 files changed, 46 insertions, 22 deletions
diff --git a/cargo/cargo.py b/cargo/cargo.py
index a819a30..821e235 100755
--- a/cargo/cargo.py
+++ b/cargo/cargo.py
@@ -8,7 +8,7 @@ import os
 
 # https://huggingface.co/docs/huggingface_hub/how-to-downstream
 def download(path: str = ".") -> None:
     bart_pretrained = hh.hf_hub_url(
-        "lysandre/arxiv-nlp", filename="config.json"
+        "sshleifer/distilbart-cnn-12-6", filename="config.json"
     )
     hh.cached_download(bart_pretrained)
diff --git a/devourer/devourer.py b/devourer/devourer.py
index f6b0520..29c63da 100644
--- a/devourer/devourer.py
+++ b/devourer/devourer.py
@@ -181,23 +181,43 @@ def pdfToText(url: str) -> str:
 
 # FIXME doesn't work for long texts
 def summarizeText(text: str) -> str:
     """Summarize the given text using bart."""
+    result = ""
+    # TODO move me later
+    transformers_summarizer = transformers.pipeline("summarization")
+    try:
+        sentences = text.split(".")
+        current_chunk = 0
+        max_chunk = 500
+        chunks = []
+
+        for sentence in sentences:
+            if len(chunks) == current_chunk + 1:
+                if (
+                    len(chunks[current_chunk]) + len(sentence.split(" "))
+                    <= max_chunk
+                ):
+                    chunks[current_chunk].extend(sentence.split(" "))
+                else:
+                    current_chunk += 1
+                    chunks.append(sentence.split(" "))
+            else:
+                chunks.append(sentence.split(" "))
+        print(chunks)
-    model = transformers.BartForConditionalGeneration.from_pretrained(
-        "facebook/bart-large-cnn"
-    )
-    tokenizer = transformers.BartTokenizer.from_pretrained(
-        "facebook/bart-large-cnn"
-    )
-    inputs = tokenizer([text], max_length=1024, return_tensors="pt")
-    summary_ids = model.generate(
-        inputs["input_ids"], num_beams=4, max_length=5, early_stopping=True
-    )
-    return [
-        tokenizer.decode(
-            g, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        for chunk_id in range(len(chunks)):
+            chunks[chunk_id] = " ".join(chunks[chunk_id])
+        print(chunks)
+
+        summaries = transformers_summarizer(
+            chunks, max_length=50, min_length=30, do_sample=False
         )
-        for g in summary_ids
-    ]
+
+        result = " ".join([summary["summary_text"] for summary in summaries])
+        print(result)
+    except Exception as e:
+        logging.exception(e)
+    finally:
+        return result
 
 
 def summarizeText_v2(text: str) -> str:
@@ -335,6 +355,8 @@ def getSentiments(detailed: bool) -> list:
 
 app = fastapi.FastAPI()
 
+nltk.download("punkt")
+
 
 # https://cheatsheetseries.owasp.org/cheatsheets/REST_Security_Cheat_Sheet.html
 @app.middleware("http")
@@ -352,10 +374,6 @@ async def addSecureHeaders(
     return response
 
 
-nltk.download("punkt")
-# transformers_summarizer = transformers.pipeline("summarization")
-
-
 @app.get("/mila/pdf")
 def pdf_ep(
     url: str, feat: str = "", audio: bool = False, summarize: bool = False
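For reference, a minimal standalone sketch of the chunk-then-summarize strategy the new summarizeText adopts: split the text into sentences, pack sentences into chunks under a word budget, summarize each chunk, and join the partial summaries. The helper name summarize_long_text is illustrative, not code from this repository; only the 500-word budget and the pipeline parameters mirror the diff above.

# Sketch, assuming `transformers` (with a torch backend) is installed.
import transformers

def summarize_long_text(text: str, max_chunk_words: int = 500) -> str:
    summarizer = transformers.pipeline("summarization")
    chunks, current = [], []
    for sentence in text.split(". "):
        words = sentence.split(" ")
        # Start a new chunk once this sentence would exceed the word budget.
        if current and len(current) + len(words) > max_chunk_words:
            chunks.append(" ".join(current))
            current = []
        current.extend(words)
    if current:
        chunks.append(" ".join(current))
    summaries = summarizer(chunks, max_length=50, min_length=30, do_sample=False)
    return " ".join(s["summary_text"] for s in summaries)

Building the pipeline once per process, rather than inside the request path as the TODO in the diff notes, avoids reloading the model weights on every call.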
diff --git a/docker-compose-test.yaml b/docker-compose-test.yaml
index 0f743b7..1545132 100644
--- a/docker-compose-test.yaml
+++ b/docker-compose-test.yaml
@@ -18,11 +18,13 @@ services:
       - WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php
       - SERVER_DEPLOYMENT_TYPE=test
       - TIKA_CLIENT_ONLY=True
+      - TRANSFORMERS_CACHE=/huggingface/cache/
     cap_drop:
       - ALL
     entrypoint: ["/docker-entrypoint.sh"]
     volumes:
       - devourer-nltk-data:/root/nltk_data/
+      - devourer-hg-cache:/huggingface/cache/
   tika:
     image: apache/tika:2.0.0
     networks:
@@ -39,9 +41,9 @@ services:
       - "127.0.0.1:8080:8080"
     environment:
       - SERVER_PORT=8080
-      - SERVER_VAULT=/cargo-vault
+      - SERVER_VAULT=/cargo-vault/
     volumes:
-      - cargo-vault:/cargo-vault
+      - cargo-vault:/cargo-vault/
     cap_drop:
       - ALL
     cap_add:
@@ -53,3 +55,4 @@ networks:
 volumes:
   cargo-vault:
   devourer-nltk-data:
+  devourer-hg-cache:
diff --git a/docker-compose.yaml b/docker-compose.yaml
index b8e16e5..fd29695 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -18,11 +18,13 @@ services:
       - WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php
       - SERVER_DEPLOYMENT_TYPE=deployment
       - TIKA_CLIENT_ONLY=True
+      - TRANSFORMERS_CACHE=/huggingface/cache/
     cap_drop:
       - ALL
     entrypoint: ["/docker-entrypoint.sh"]
     volumes:
       - devourer-nltk-data:/root/nltk_data/
+      - devourer-hg-cache:/huggingface/cache/
   tika:
     image: apache/tika:2.0.0
     networks:
@@ -51,3 +53,4 @@ networks:
 volumes:
   cargo-vault:
   devourer-nltk-data:
+  devourer-hg-cache:
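TRANSFORMERS_CACHE tells the transformers library where to store downloaded model weights, so mounting the devourer-hg-cache volume at that path keeps the distilbart download across container restarts. A small sketch of how the variable is picked up (in the 4.x releases current at the time, the cache path is resolved when transformers is first imported, which is why setting it in the compose environment block works):

# Sketch: the cache location must be set before the first `import transformers`.
import os
os.environ.setdefault("TRANSFORMERS_CACHE", "/huggingface/cache/")

import transformers

# Any pipeline created now will download into (or reuse) /huggingface/cache/.
summarizer = transformers.pipeline("summarization")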
