diff options
-rw-r--r-- | devourer.py | 17 | ||||
-rwxr-xr-x | tests.sh | 3 |
2 files changed, 13 insertions, 7 deletions
diff --git a/devourer.py b/devourer.py
index 0a64e8e..75f469b 100644
--- a/devourer.py
+++ b/devourer.py
@@ -131,7 +131,7 @@ def extractRequirements(textBody: str) -> list:
     sentences = nltk.sent_tokenize(textBody)
     for sentence in sentences:
         for keyword in REQ_KEYWORDS:
-            if sentence.find(keyword) >= 0:
+            if sentence.casefold().find(keyword) >= 0:
                 result.append(sentence)
     return result
 
@@ -183,17 +183,22 @@ def getRequirements(url: str, sourcetype: str) -> list:
             a = Article(article.url)
             a.download()
             a.parse()
+            a.nlp()
             doc = Document(a.html)
+            print(doc)
             # print(doc.summary())
-            results = extractRequirements(doc.summary())
+            # results = extractRequirements(doc.summary())
+            results = extractRequirements(doc)
         elif sourcetype == "text":
             bytesText = simpleGet(url)
             results = extractRequirements(bytesText.decode("utf-8"))
     except Exception as e:
         logging.exception(e)
     finally:
-        result = "".join(results + "\n")
-        return result
+        print(result)
+        # result = "".join(results) + "\n"
+        # return result
+        return results
 
 
 # FIXME-summary=bart doesnt work
@@ -284,10 +289,10 @@ def pdf_to_audio_ep(url: str):
 @app.get("/mila/reqs")
 def extract_reqs_ep(url: str, sourcetype: str = "html"):
     """extracts the requirements from a given url"""
-    result = getRequirements()
+    result = getRequirements(url, sourcetype)
     return {
         "Content-Type": "application/json",
-        "isOK": True if result != "" else False,
+        "isOK": True if result is not None else False,
         "reqs": result,
     }
 
@@ -1,4 +1,5 @@
 #!/usr/bin/env sh
 
-curl -k -X GET https://localhost:19019/mila/summ?url=https://dilipkumar.medium.com/standalone-mongodb-on-kubernetes-cluster-19e7b5896b27&summary=newspaper&audio=false
+curl -k -X GET https://localhost:19019/mila/summ?url=https://dilipkumar.medium.com/standalone-mongodb-on-kubernetes-cluster-19e7b5896b27&summary=newspaper&audio=true
 curl -k -X GET https://localhost:19019/mila/wiki?term=iommu
+curl -k -X GET https://localhost:19019/mila/reqs?url=https://www.ietf.org/rfc/rfc2865.txt&sourcetype=text