author    terminaldweller <thabogre@gmail.com>  2022-05-27 19:05:01 +0000
committer terminaldweller <thabogre@gmail.com>  2022-05-27 19:05:01 +0000
commit    cc075d6cc859e427ecbffc1e454d70b34da2cdfb (patch)
tree      82bed88363b025c88bc3f1404ad4df5e37e2cb55
parent    refextract fix. wasnt copying libmagic over in the final docker image (diff)
wip
-rwxr-xr-x  cargo/cargo.py             2
-rw-r--r--  devourer/devourer.py      56
-rw-r--r--  docker-compose-test.yaml   7
-rw-r--r--  docker-compose.yaml        3
4 files changed, 46 insertions, 22 deletions
diff --git a/cargo/cargo.py b/cargo/cargo.py
index a819a30..821e235 100755
--- a/cargo/cargo.py
+++ b/cargo/cargo.py
@@ -8,7 +8,7 @@ import os
 # https://huggingface.co/docs/huggingface_hub/how-to-downstream
 def download(path: str = ".") -> None:
     bart_pretrained = hh.hf_hub_url(
-        "lysandre/arxiv-nlp", filename="config.json"
+        "sshleifer/distilbart-cnn-12-6", filename="config.json"
     )
     hh.cached_download(bart_pretrained)
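The checkpoint swapped in above, sshleifer/distilbart-cnn-12-6, is the model that transformers.pipeline("summarization") loads by default, so pre-fetching its config warms the same cache the new summarizeText() path reads from. A minimal sketch of going one step further and pre-fetching the whole checkpoint (an assumption, not something this commit does) could use huggingface_hub's snapshot_download:

import huggingface_hub as hh

# hypothetical helper, not part of this commit: cache every file of the
# distilbart checkpoint so the summarization pipeline never downloads at runtime
def download_full_checkpoint() -> str:
    # snapshot_download returns the local path of the cached snapshot
    return hh.snapshot_download("sshleifer/distilbart-cnn-12-6")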
diff --git a/devourer/devourer.py b/devourer/devourer.py
index f6b0520..29c63da 100644
--- a/devourer/devourer.py
+++ b/devourer/devourer.py
@@ -181,23 +181,43 @@ def pdfToText(url: str) -> str:
 # FIXME doesnt work for long texts
 def summarizeText(text: str) -> str:
     """Summarize the given text using bart."""
+    result = str
+    # TODO move me later
+    transformers_summarizer = transformers.pipeline("summarization")
+    try:
+        sentences = text.split(".")
+        current_chunk = 0
+        max_chunk = 500
+        chunks = []
+
+        for sentence in sentences:
+            if len(chunks) == current_chunk + 1:
+                if (
+                    len(chunks[current_chunk]) + len(sentence.split(" "))
+                    <= max_chunk
+                ):
+                    chunks[current_chunk].extend(sentence.split(" "))
+                else:
+                    current_chunk = +1
+                    chunks.append(sentence.split(" "))
+            else:
+                chunks.append(sentence.split(" "))
+        print(chunks)
-    model = transformers.BartForConditionalGeneration.from_pretrained(
-        "facebook/bart-large-cnn"
-    )
-    tokenizer = transformers.BartTokenizer.from_pretrained(
-        "facebook/bart-large-cnn"
-    )
-    inputs = tokenizer([text], max_length=1024, return_tensors="pt")
-    summary_ids = model.generate(
-        inputs["input_ids"], num_beams=4, max_length=5, early_stopping=True
-    )
-    return [
-        tokenizer.decode(
-            g, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        for chunk_id in range(len(chunks)):
+            chunks[chunk_id] = "".join(chunks[chunk_id])
+        print(chunks)
+
+        summaries = transformers_summarizer(
+            chunks, max_length=50, min_length=30, do_sample=False
         )
-        for g in summary_ids
-    ]
+
+        result = "".join([summary["summary_text"] for summary in summaries])
+        print(result)
+    except Exception as e:
+        logging.exception(e)
+    finally:
+        return result
def summarizeText_v2(text: str) -> str:
@@ -335,6 +355,8 @@ def getSentiments(detailed: bool) -> list:
 app = fastapi.FastAPI()
+nltk.download("punkt")
+
 # https://cheatsheetseries.owasp.org/cheatsheets/REST_Security_Cheat_Sheet.html
 @app.middleware("http")
@@ -352,10 +374,6 @@ async def addSecureHeaders(
     return response
-nltk.download("punkt")
-# transformers_summarizer = transformers.pipeline("summarization")
-
-
 @app.get("/mila/pdf")
 def pdf_ep(
     url: str, feat: str = "", audio: bool = False, summarize: bool = False
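For reference, a corrected standalone sketch of the chunked-summarization approach that summarizeText() switches to above. This is not the committed code: it starts the accumulator as an empty result (the commit's result = str binds the str type itself), advances the chunk index with current_chunk += 1 (the commit's current_chunk = +1 always assigns 1), and re-joins words with spaces before handing the chunks to the pipeline:

import transformers

def summarize_long_text(text: str, max_chunk: int = 500) -> str:
    """Split text into roughly max_chunk-word chunks on sentence boundaries,
    summarize each chunk with the default bart pipeline, then concatenate."""
    summarizer = transformers.pipeline("summarization")
    chunks: list = [[]]
    for sentence in text.split("."):
        words = sentence.split(" ")
        if len(chunks[-1]) + len(words) <= max_chunk:
            chunks[-1].extend(words)
        else:
            chunks.append(words)
    texts = [" ".join(chunk) for chunk in chunks if chunk]
    summaries = summarizer(texts, max_length=50, min_length=30, do_sample=False)
    return " ".join(summary["summary_text"] for summary in summaries)

The same diff also moves nltk.download("punkt") up next to the FastAPI app object and drops the commented-out module-level summarizer, since the pipeline is now built inside summarizeText() itself.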
diff --git a/docker-compose-test.yaml b/docker-compose-test.yaml
index 0f743b7..1545132 100644
--- a/docker-compose-test.yaml
+++ b/docker-compose-test.yaml
@@ -18,11 +18,13 @@ services:
       - WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php
       - SERVER_DEPLOYMENT_TYPE=test
       - TIKA_CLIENT_ONLY=True
+      - TRANSFORMERS_CACHE=/huggingface/cache/
     cap_drop:
       - ALL
     entrypoint: ["/docker-entrypoint.sh"]
     volumes:
       - devourer-nltk-data:/root/nltk_data/
+      - devourer-hg-cache:/huggingface/cache/
   tika:
     image: apache/tika:2.0.0
     networks:
@@ -39,9 +41,9 @@ services:
       - "127.0.0.1:8080:8080"
     environment:
       - SERVER_PORT=8080
-      - SERVER_VAULT=/cargo-vault
+      - SERVER_VAULT=/cargo-vault/
     volumes:
-      - cargo-vault:/cargo-vault
+      - cargo-vault:/cargo-vault/
     cap_drop:
       - ALL
     cap_add:
@@ -53,3 +55,4 @@ networks:
 volumes:
   cargo-vault:
   devourer-nltk-data:
+  devourer-hg-cache:
diff --git a/docker-compose.yaml b/docker-compose.yaml
index b8e16e5..fd29695 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -18,11 +18,13 @@ services:
       - WIKI_SEARCH_URL=https://en.wikipedia.org/w/api.php
       - SERVER_DEPLOYMENT_TYPE=deployment
       - TIKA_CLIENT_ONLY=True
+      - TRANSFORMERS_CACHE=/huggingface/cache/
     cap_drop:
       - ALL
     entrypoint: ["/docker-entrypoint.sh"]
     volumes:
       - devourer-nltk-data:/root/nltk_data/
+      - devourer-hg-cache:/huggingface/cache/
   tika:
     image: apache/tika:2.0.0
     networks:
@@ -51,3 +53,4 @@ networks:
 volumes:
   cargo-vault:
   devourer-nltk-data:
+  devourer-hg-cache:
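Both compose files point TRANSFORMERS_CACHE at the new devourer-hg-cache volume so the model files the summarization pipeline downloads outlive container rebuilds; transformers reads this environment variable when resolving its cache directory. A hypothetical sanity check from inside the devourer container (not part of this commit):

import os
import transformers

# with TRANSFORMERS_CACHE pointing at the mounted volume, building a pipeline
# should populate /huggingface/cache/ rather than the container's writable layer
cache_dir = os.environ.get("TRANSFORMERS_CACHE", "/huggingface/cache/")
transformers.pipeline("summarization")
print(cache_dir, os.listdir(cache_dir))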