author     terminaldweller <thabogre@gmail.com>    2022-06-14 16:57:25 +0000
committer  terminaldweller <thabogre@gmail.com>    2022-06-14 16:57:25 +0000
commit     43d266146a18466fb842e0637f3351a2eaef38c7 (patch)
tree       a5ee7984827ed9f28f39a92cb63ae9e1e189c490
parent     added a summarization that works (diff)
download   devourer-43d266146a18466fb842e0637f3351a2eaef38c7.tar.gz
           devourer-43d266146a18466fb842e0637f3351a2eaef38c7.zip
added keyword extraction feature, some pylint cleanup
-rw-r--r--  devourer/devourer.py    | 128
-rw-r--r--  devourer/poetry.lock    |  30
-rw-r--r--  devourer/pyproject.toml |   1
-rwxr-xr-x  devourer/tests.sh       |   4
4 files changed, 97 insertions, 66 deletions
diff --git a/devourer/devourer.py b/devourer/devourer.py
index 29d4506..7ee2fb1 100644
--- a/devourer/devourer.py
+++ b/devourer/devourer.py
@@ -15,6 +15,7 @@ import fastapi
 import gtts # type:ignore
 import newspaper # type:ignore
 import nltk # type:ignore
+import rake_nltk # type:ignore
 import readability # type:ignore
 import refextract # type:ignore
 import requests
@@ -24,42 +25,42 @@ from tika import parser as tparser
 
 
 # FIXME-maybe actually really do some logging
-def logError(err: str) -> None:
+def log_error(err: str) -> None:
     """Logs the errors."""
     logging.exception(err)
 
 
-def isAGoodResponse(resp: requests.Response) -> bool:
+def is_a_good_response(resp: requests.Response) -> bool:
     """Checks whether the get we sent got a 200 response."""
     content_type = resp.headers["Content-Type"].lower()
     return resp.status_code == 200 and content_type is not None
 
 
-def simpleGet(url: str) -> bytes:
+def simple_get(url: str) -> bytes:
     """Issues a simple get request."""
     content = bytes()
     try:
         with contextlib.closing(requests.get(url, stream=True)) as resp:
-            if isAGoodResponse(resp):
+            if is_a_good_response(resp):
                 content = resp.content
     except requests.exceptions.RequestException as e:
-        logError("Error during requests to {0} : {1}".format(url, str(e)))
+        log_error("Error during requests to {0} : {1}".format(url, str(e)))
     finally:
         return content
 
 
-def getWithParams(url: str, params: dict) -> typing.Optional[dict]:
+def get_with_params(url: str, params: dict) -> typing.Optional[dict]:
     """Issues a get request with params."""
     try:
         with contextlib.closing(
             requests.get(url, params=params, stream=True)
         ) as resp:
-            if isAGoodResponse(resp):
+            if is_a_good_response(resp):
                 return resp.json()
             else:
                 return None
     except requests.exceptions.RequestException as e:
-        logError("Error during requests to {0} : {1}".format(url, str(e)))
+        log_error("Error during requests to {0} : {1}".format(url, str(e)))
         return None
 
 
@@ -70,8 +71,8 @@ def getRandStr(n):
 
 def getURLS(source: str, summary: str) -> dict:
     """Extracts the urls from a website."""
-    result = dict()
-    raw_ml = simpleGet(source)
+    result = {}
+    raw_ml = simple_get(source)
     ml = bs4.BeautifulSoup(raw_ml, "lxml")
 
     rand_tmp = "/tmp/" + getRandStr(20)
@@ -94,7 +95,7 @@ def getURLS(source: str, summary: str) -> dict:
     return result
 
 
-def configNews(config: newspaper.Config) -> None:
+def config_news(config: newspaper.Config) -> None:
     """Configures newspaper."""
     config.fetch_images = False
     config.keep_article_html = True
@@ -102,7 +103,7 @@ def configNews(config: newspaper.Config) -> None:
     config.browser_user_agent = "Chrome/91.0.4464.5"
 
 
-def sanitizeText(text: str) -> str:
+def sanitize_text(text: str) -> str:
     """Sanitize the strings."""
     text = text.replace("\n", "")
     text = text.replace("\n\r", "")
@@ -111,7 +112,7 @@ def sanitizeText(text: str) -> str:
 
 
 # FIXME-have to decide whether to use files or urls
-def pdfToVoice() -> str:
+def pdf_to_voice() -> str:
     """Main function for converting a pdf to an mp3."""
     outfile = str()
     try:
@@ -145,13 +146,13 @@ def extractRequirements(textBody: str) -> list:
     for sentence in sentences:
         for keyword in REQ_KEYWORDS:
             if sentence.casefold().find(keyword) >= 0:
-                result.append(sanitizeText(sentence))
+                result.append(sanitize_text(sentence))
     return result
 
 
-def extractRefs(url: str) -> list:
+def extract_refs(url: str) -> list:
     """Extract the references from an article."""
-    refs = list()
+    refs = []
     try:
         refs = refextract.extract_references_from_url(url)
         return refs
@@ -161,12 +162,12 @@ def extractRefs(url: str) -> list:
         return refs
 
 
-def pdfToText(url: str) -> str:
+def pdf_to_text(url: str) -> str:
     """Convert the PDF file to a string."""
-    tikaResult = dict()
+    tikaResult = {}
     try:
         with tempfile.NamedTemporaryFile(mode="w+b", delete=True) as tmpFile:
-            content = simpleGet(url)
+            content = simple_get(url)
             if content is not None:
                 tmpFile.write(content)
                 tikaResult = tparser.from_file(
@@ -179,13 +180,13 @@ def pdfToText(url: str) -> str:
         logging.exception(e)
     finally:
         if "content" in tikaResult:
-            return sanitizeText(tikaResult["content"])
+            return sanitize_text(tikaResult["content"])
         else:
             return ""
 
 
-# FIXME doesnt work for long texts
-def summarizeText(text: str) -> str:
+# TODO-very performance-intensive
+def summarize_text(text: str) -> str:
     """Summarize the given text using bart."""
     result = str()
     # TODO move me later
@@ -226,7 +227,7 @@ def summarizeText(text: str) -> str:
     return result
 
 
-def summarizeText_v2(text: str) -> str:
+def summarize_text_v2(text: str) -> str:
     """Text summarization using nltk."""
     stop_words = set(nltk.corpus.stopwords.words("english"))
     words = nltk.tokenize.word_tokenize(text)
@@ -267,7 +268,7 @@ def summarizeText_v2(text: str) -> str:
     return summary
 
 
-def textToAudio(text: str) -> str:
+def text_to_audio(text: str) -> str:
     """Transform the given text into audio."""
     path = str()
     try:
@@ -284,7 +285,7 @@ def textToAudio(text: str) -> str:
 def getRequirements(url: str, sourcetype: str) -> list:
     """Runs the single-link main function."""
     result = str()
-    results = list()
+    results = []
     try:
         if sourcetype == "html":
             parser = newspaper.build(url)
@@ -299,7 +300,7 @@ def getRequirements(url: str, sourcetype: str) -> list:
             # results = extractRequirements(doc.summary())
             results = extractRequirements(doc)
         elif sourcetype == "text":
-            bytesText = simpleGet(url)
+            bytesText = simple_get(url)
             results = extractRequirements(bytesText.decode("utf-8"))
     except Exception as e:
         logging.exception(e)
@@ -328,7 +329,7 @@ def summarizeLinkToAudio(url: str, summary: str) -> str:
         else:
             print("invalid option for summary type.")
         if result != "":
-            result = sanitizeText(result)
+            result = sanitize_text(result)
     except Exception as e:
         logging.exception(e)
     finally:
@@ -338,11 +339,11 @@ def summarizeLinkToAudio(url: str, summary: str) -> str:
 # FIXME-change my name
 def summarizeLinksToAudio(url: str, summary: str) -> str:
     """Summarize a list of urls into audio files."""
-    results = list()
+    results = []
     result = str()
     try:
         config = newspaper.Config()
-        configNews(config)
+        config_news(config)
         urls = getURLS(url, summary)
         for url in urls:
             results.append(summarizeLinkToAudio(url, summary))
@@ -366,19 +367,19 @@ def searchWikipedia(search_term: str, summary: str) -> str:
             "limit": "10",
             "format": "json",
         }
-        res = getWithParams(os.environ["WIKI_SEARCH_URL"], searchParmas)
+        res = get_with_params(os.environ["WIKI_SEARCH_URL"], searchParmas)
         # FIXME-handle wiki redirects/disambiguations
         if res is not None:
             source = res[3][0]
             result = summarizeLinkToAudio(source, summary)
-            result = sanitizeText(result)
+            result = sanitize_text(result)
     except Exception as e:
         logging.exception(e)
     finally:
         return result
 
 
-def getAudioFromFile(audio_path: str) -> bytes:
+def get_audio_from_file(audio_path: str) -> bytes:
     """Returns the contents of a file in binary format."""
     with open(audio_path, "rb") as audio:
         return audio.read()
@@ -397,15 +398,23 @@ def getSentiments(detailed: bool) -> list:
 """
 
 
+def get_keywords_from_text(text: str) -> typing.List[str]:
+    """Extract keywords out of text."""
+    rake_nltk_var = rake_nltk.Rake()
+    rake_nltk_var.extract_keywords_from_text(text)
+    return rake_nltk_var.get_ranked_phrases()
+
+
 app = fastapi.FastAPI()
 
 nltk.download("punkt")
 nltk.download("stopwords")
+nltk.download("wordnet")
 
 
 # https://cheatsheetseries.owasp.org/cheatsheets/REST_Security_Cheat_Sheet.html
 @app.middleware("http")
-async def addSecureHeaders(
+async def add_secure_headers(
     request: fastapi.Request, call_next
 ) -> fastapi.Response:
     """Adds security headers proposed by OWASP."""
@@ -425,35 +434,50 @@ def pdf_ep(
 ):
     """The pdf manupulation endpoint."""
     if feat == "":
-        text = pdfToText(url)
+        text = pdf_to_text(url)
         if summarize:
-            text = summarizeText_v2(text)
+            text = summarize_text_v2(text)
         if audio:
-            audio_path = textToAudio(text)
+            audio_path = text_to_audio(text)
             return fastapi.Response(
-                getAudioFromFile(audio_path) if audio_path != "" else "",
+                get_audio_from_file(audio_path) if audio_path != "" else "",
                 media_type="audio/mpeg",
             )
         return {
             "Content-Type": "application/json",
-            "isOk": True if text != "" else False,
+            "isOk": bool(text),
             "result": text,
         }
     elif feat == "refs":
-        refs = extractRefs(url)
+        refs = extract_refs(url)
         return {
             "Content-Type": "application/json",
-            "isOk": True if refs is not None else False,
+            "isOk": bool(refs),
             "result": refs,
         }
+    elif feat == "keyword":
+        text = pdf_to_text(url)
+        keywords = get_keywords_from_text(text)
+        return {
+            "Content-Type": "application/json",
+            "isOk": bool(keywords),
+            "result": keywords,
+        }
+    else:
+        return {
+            "Content-Type": "application/json",
+            "isOk": False,
+            "result": "unknown feature requested",
+        }
 
 
+# TODO- currently not working
 @app.get("/mila/tika")
 def pdf_to_audio_ep(url: str):
     """Turns a pdf into an audiofile."""
-    audio_path = pdfToVoice()
+    audio_path = pdf_to_voice()
     return fastapi.Response(
-        getAudioFromFile(audio_path) if audio_path != "" else "",
+        get_audio_from_file(audio_path) if audio_path != "" else "",
         media_type="audio/mpeg",
     )
 
@@ -464,7 +488,7 @@ def extract_reqs_ep(url: str, sourcetype: str = "html"):
     result = getRequirements(url, sourcetype)
     return {
         "Content-Type": "application/json",
-        "isOK": True if result is not None else False,
+        "isOK": bool(result),
         "reqs": result,
     }
 
@@ -474,15 +498,15 @@ def wiki_search_ep(term: str, summary: str = "none", audio: bool = False):
     """Search and summarizes from wikipedia."""
     text = searchWikipedia(term, summary)
     if audio:
-        audio_path = textToAudio(text)
+        audio_path = text_to_audio(text)
         return fastapi.Response(
-            getAudioFromFile(audio_path) if audio_path != "" else "",
+            get_audio_from_file(audio_path) if audio_path != "" else "",
             media_type="audio/mpeg",
         )
     else:
         return {
             "Content-Type": "application/json",
-            "isOK": True if text != "" else False,
+            "isOK": bool(text),
             "audio": "",
             "text": text,
         }
@@ -493,16 +517,16 @@ def summarize_ep(url: str, summary: str = "none", audio: bool = False):
     """Summarize and turn the summary into audio."""
     text = summarizeLinkToAudio(url, summary)
    if audio:
-        audio_path = textToAudio(text)
+        audio_path = text_to_audio(text)
         print(audio_path)
         return fastapi.Response(
-            getAudioFromFile(audio_path) if audio_path != "" else "",
+            get_audio_from_file(audio_path) if audio_path != "" else "",
             media_type="audio/mpeg",
         )
     else:
         return {
             "Content-Type": "application/json",
-            "isOK": True if text != "" else False,
+            "isOK": bool(text),
             # "audio": "",
             "text": text,
         }
@@ -513,16 +537,16 @@ def mila_ep(url: str, summary: str = "newspaper", audio: bool = False):
     """Extract all the urls and then summarize and turn into audio."""
     text = summarizeLinksToAudio(url, summary)
     if audio:
-        audio_path = textToAudio(text)
+        audio_path = text_to_audio(text)
         print(audio_path)
         return fastapi.Response(
-            getAudioFromFile(audio_path) if audio_path != "" else "",
+            get_audio_from_file(audio_path) if audio_path != "" else "",
             media_type="audio/mpeg",
         )
     else:
         return {
             "Content-Type": "application/json",
-            "isOK": True if text != "" else False,
+            "isOK": bool(text),
             "audio": "",
             "text": text,
         }
diff --git a/devourer/poetry.lock b/devourer/poetry.lock
index ce2a8e6..be1bb56 100644
--- a/devourer/poetry.lock
+++ b/devourer/poetry.lock
@@ -663,14 +663,6 @@ optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 
 [[package]]
-name = "python-magic-bin"
-version = "0.4.14"
-description = "File type identification using libmagic binary package"
-category = "main"
-optional = false
-python-versions = "*"
-
-[[package]]
 name = "pytz"
 version = "2022.1"
 description = "World timezone definitions, modern and historical"
@@ -687,6 +679,17 @@ optional = false
 python-versions = ">=3.6"
 
 [[package]]
+name = "rake-nltk"
+version = "1.0.6"
+description = "RAKE short for Rapid Automatic Keyword Extraction algorithm, is a domain independent keyword extraction algorithm which tries to determine key phrases in a body of text by analyzing the frequency of word appearance and its co-occurance with other words in the text."
+category = "main"
+optional = false
+python-versions = ">=3.6,<4.0"
+
+[package.dependencies]
+nltk = ">=3.6.2,<4.0.0"
+
+[[package]]
 name = "readability-lxml"
 version = "0.8.1"
 description = "fast html to text parser (article readability tool) with python 3 support"
@@ -1157,7 +1160,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.8"
-content-hash = "a740bd5805861994b28f7a187d06be052f26dd920355f6624955134e62cb6059"
+content-hash = "24d72bba62f852cba715eb9f954a00454da5f045138088d365fc27d8de2a035f"
 
 [metadata.files]
 absl-py = [
@@ -1642,11 +1645,6 @@ python-magic = [
     {file = "python-magic-0.4.25.tar.gz", hash = "sha256:21f5f542aa0330f5c8a64442528542f6215c8e18d2466b399b0d9d39356d83fc"},
     {file = "python_magic-0.4.25-py2.py3-none-any.whl", hash = "sha256:1a2c81e8f395c744536369790bd75094665e9644110a6623bcc3bbea30f03973"},
 ]
-python-magic-bin = [
-    {file = "python_magic_bin-0.4.14-py2.py3-none-macosx_10_6_intel.whl", hash = "sha256:7b1743b3dbf16601d6eedf4e7c2c9a637901b0faaf24ad4df4d4527e7d8f66a4"},
-    {file = "python_magic_bin-0.4.14-py2.py3-none-win32.whl", hash = "sha256:34a788c03adde7608028203e2dbb208f1f62225ad91518787ae26d603ae68892"},
-    {file = "python_magic_bin-0.4.14-py2.py3-none-win_amd64.whl", hash = "sha256:90be6206ad31071a36065a2fc169c5afb5e0355cbe6030e87641c6c62edc2b69"},
-]
 pytz = [
     {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"},
     {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"},
@@ -1686,6 +1684,10 @@ pyyaml = [
     {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"},
     {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"},
 ]
+rake-nltk = [
+    {file = "rake-nltk-1.0.6.tar.gz", hash = "sha256:7813d680b2ce77b51cdac1757f801a87ff47682c9dbd2982aea3b66730346122"},
+    {file = "rake_nltk-1.0.6-py3-none-any.whl", hash = "sha256:1c1ffdb64cae8cb99d169d53a5ffa4635f1c4abd3a02c6e22d5d083136bdc5c1"},
+]
 readability-lxml = [
     {file = "readability-lxml-0.8.1.tar.gz", hash = "sha256:e51fea56b5909aaf886d307d48e79e096293255afa567b7d08bca94d25b1a4e1"},
     {file = "readability_lxml-0.8.1-py3-none-any.whl", hash = "sha256:e0d366a21b1bd6cca17de71a4e6ea16fcfaa8b0a5b4004e39e2c7eff884e6305"},
diff --git a/devourer/pyproject.toml b/devourer/pyproject.toml
index 2f79961..52b6b8c 100644
--- a/devourer/pyproject.toml
+++ b/devourer/pyproject.toml
@@ -19,6 +19,7 @@ transformers = "^4.11.2"
 fastapi = "^0.70.0"
 uvicorn = "^0.15.0"
 refextract = "^1.1.4"
+rake-nltk = "^1.0.6"
 
 [tool.poetry.dev-dependencies]
 
diff --git a/devourer/tests.sh b/devourer/tests.sh
index e673acc..4ba4ea0 100755
--- a/devourer/tests.sh
+++ b/devourer/tests.sh
@@ -1,8 +1,12 @@
 #!/usr/bin/env sh
 
 curl -k -X GET "https://localhost:19019/mila/summ?url=https://dilipkumar.medium.com/standalone-mongodb-on-kubernetes-cluster-19e7b5896b27&summary=newspaper&audio=true"
+
 curl -k -X GET "https://localhost:19019/mila/wiki?term=iommu&summary=none&audio=false"
+
 curl -k -X GET "https://localhost:19019/mila/reqs?url=https://www.ietf.org/rfc/rfc2865.txt&sourcetype=text"
+
 curl -k -X GET "https://localhost:19019/mila/pdf?feat=&url=https://www.rroij.com/open-access/mutation-testing-a-review-33-36.pdf"
 curl -k -X GET "https://localhost:19019/mila/pdf?feat=refs&url=https://www.rroij.com/open-access/mutation-testing-a-review-33-36.pdf"
+curl -k -X GET "https://localhost:19019/mila/pdf?feat=keyword&url=https://www.rroij.com/open-access/mutation-testing-a-review-33-36.pdf&summarize=true"
 curl -k -X GET "https://localhost:19019/mila/pdf?feat=&url=https://www.rroij.com/open-access/mutation-testing-a-review-33-36.pdf&summarize=true"