diff options
| author | terminaldweller <thabogre@gmail.com> | 2022-06-14 16:57:25 +0000 | 
|---|---|---|
| committer | terminaldweller <thabogre@gmail.com> | 2022-06-14 16:57:25 +0000 | 
| commit | 43d266146a18466fb842e0637f3351a2eaef38c7 (patch) | |
| tree | a5ee7984827ed9f28f39a92cb63ae9e1e189c490 | |
| parent | added a summarization that works (diff) | |
| download | devourer-43d266146a18466fb842e0637f3351a2eaef38c7.tar.gz devourer-43d266146a18466fb842e0637f3351a2eaef38c7.zip  | |
added keyword extraction feature, some pylint cleanup
Diffstat (limited to '')
| -rw-r--r-- | devourer/devourer.py | 128 | ||||
| -rw-r--r-- | devourer/poetry.lock | 30 | ||||
| -rw-r--r-- | devourer/pyproject.toml | 1 | ||||
| -rwxr-xr-x | devourer/tests.sh | 4 | 
4 files changed, 97 insertions, 66 deletions
diff --git a/devourer/devourer.py b/devourer/devourer.py index 29d4506..7ee2fb1 100644 --- a/devourer/devourer.py +++ b/devourer/devourer.py @@ -15,6 +15,7 @@ import fastapi  import gtts  # type:ignore  import newspaper  # type:ignore  import nltk  # type:ignore +import rake_nltk  # type:ignore  import readability  # type:ignore  import refextract  # type:ignore  import requests @@ -24,42 +25,42 @@ from tika import parser as tparser  # FIXME-maybe actually really do some logging -def logError(err: str) -> None: +def log_error(err: str) -> None:      """Logs the errors."""      logging.exception(err) -def isAGoodResponse(resp: requests.Response) -> bool: +def is_a_good_response(resp: requests.Response) -> bool:      """Checks whether the get we sent got a 200 response."""      content_type = resp.headers["Content-Type"].lower()      return resp.status_code == 200 and content_type is not None -def simpleGet(url: str) -> bytes: +def simple_get(url: str) -> bytes:      """Issues a simple get request."""      content = bytes()      try:          with contextlib.closing(requests.get(url, stream=True)) as resp: -            if isAGoodResponse(resp): +            if is_a_good_response(resp):                  content = resp.content      except requests.exceptions.RequestException as e: -        logError("Error during requests to {0} : {1}".format(url, str(e))) +        log_error("Error during requests to {0} : {1}".format(url, str(e)))      finally:          return content -def getWithParams(url: str, params: dict) -> typing.Optional[dict]: +def get_with_params(url: str, params: dict) -> typing.Optional[dict]:      """Issues a get request with params."""      try:          with contextlib.closing(              requests.get(url, params=params, stream=True)          ) as resp: -            if isAGoodResponse(resp): +            if is_a_good_response(resp):                  return resp.json()              else:                  return None      except requests.exceptions.RequestException as e: -        logError("Error during requests to {0} : {1}".format(url, str(e))) +        log_error("Error during requests to {0} : {1}".format(url, str(e)))          return None @@ -70,8 +71,8 @@ def getRandStr(n):  def getURLS(source: str, summary: str) -> dict:      """Extracts the urls from a website.""" -    result = dict() -    raw_ml = simpleGet(source) +    result = {} +    raw_ml = simple_get(source)      ml = bs4.BeautifulSoup(raw_ml, "lxml")      rand_tmp = "/tmp/" + getRandStr(20) @@ -94,7 +95,7 @@ def getURLS(source: str, summary: str) -> dict:      return result -def configNews(config: newspaper.Config) -> None: +def config_news(config: newspaper.Config) -> None:      """Configures newspaper."""      config.fetch_images = False      config.keep_article_html = True @@ -102,7 +103,7 @@ def configNews(config: newspaper.Config) -> None:      config.browser_user_agent = "Chrome/91.0.4464.5" -def sanitizeText(text: str) -> str: +def sanitize_text(text: str) -> str:      """Sanitize the strings."""      text = text.replace("\n", "")      text = text.replace("\n\r", "") @@ -111,7 +112,7 @@ def sanitizeText(text: str) -> str:  # FIXME-have to decide whether to use files or urls -def pdfToVoice() -> str: +def pdf_to_voice() -> str:      """Main function for converting a pdf to an mp3."""      outfile = str()      try: @@ -145,13 +146,13 @@ def extractRequirements(textBody: str) -> list:      for sentence in sentences:          for keyword in REQ_KEYWORDS:              if sentence.casefold().find(keyword) >= 0: -                result.append(sanitizeText(sentence)) +                result.append(sanitize_text(sentence))      return result -def extractRefs(url: str) -> list: +def extract_refs(url: str) -> list:      """Extract the references from an article.""" -    refs = list() +    refs = []      try:          refs = refextract.extract_references_from_url(url)          return refs @@ -161,12 +162,12 @@ def extractRefs(url: str) -> list:          return refs -def pdfToText(url: str) -> str: +def pdf_to_text(url: str) -> str:      """Convert the PDF file to a string.""" -    tikaResult = dict() +    tikaResult = {}      try:          with tempfile.NamedTemporaryFile(mode="w+b", delete=True) as tmpFile: -            content = simpleGet(url) +            content = simple_get(url)              if content is not None:                  tmpFile.write(content)                  tikaResult = tparser.from_file( @@ -179,13 +180,13 @@ def pdfToText(url: str) -> str:          logging.exception(e)      finally:          if "content" in tikaResult: -            return sanitizeText(tikaResult["content"]) +            return sanitize_text(tikaResult["content"])          else:              return "" -# FIXME doesnt work for long texts -def summarizeText(text: str) -> str: +# TODO-very performance-intensive +def summarize_text(text: str) -> str:      """Summarize the given text using bart."""      result = str()      # TODO move me later @@ -226,7 +227,7 @@ def summarizeText(text: str) -> str:          return result -def summarizeText_v2(text: str) -> str: +def summarize_text_v2(text: str) -> str:      """Text summarization using nltk."""      stop_words = set(nltk.corpus.stopwords.words("english"))      words = nltk.tokenize.word_tokenize(text) @@ -267,7 +268,7 @@ def summarizeText_v2(text: str) -> str:      return summary -def textToAudio(text: str) -> str: +def text_to_audio(text: str) -> str:      """Transform the given text into audio."""      path = str()      try: @@ -284,7 +285,7 @@ def textToAudio(text: str) -> str:  def getRequirements(url: str, sourcetype: str) -> list:      """Runs the single-link main function."""      result = str() -    results = list() +    results = []      try:          if sourcetype == "html":              parser = newspaper.build(url) @@ -299,7 +300,7 @@ def getRequirements(url: str, sourcetype: str) -> list:                  # results = extractRequirements(doc.summary())                  results = extractRequirements(doc)          elif sourcetype == "text": -            bytesText = simpleGet(url) +            bytesText = simple_get(url)              results = extractRequirements(bytesText.decode("utf-8"))      except Exception as e:          logging.exception(e) @@ -328,7 +329,7 @@ def summarizeLinkToAudio(url: str, summary: str) -> str:          else:              print("invalid option for summary type.")          if result != "": -            result = sanitizeText(result) +            result = sanitize_text(result)      except Exception as e:          logging.exception(e)      finally: @@ -338,11 +339,11 @@ def summarizeLinkToAudio(url: str, summary: str) -> str:  # FIXME-change my name  def summarizeLinksToAudio(url: str, summary: str) -> str:      """Summarize a list of urls into audio files.""" -    results = list() +    results = []      result = str()      try:          config = newspaper.Config() -        configNews(config) +        config_news(config)          urls = getURLS(url, summary)          for url in urls:              results.append(summarizeLinkToAudio(url, summary)) @@ -366,19 +367,19 @@ def searchWikipedia(search_term: str, summary: str) -> str:              "limit": "10",              "format": "json",          } -        res = getWithParams(os.environ["WIKI_SEARCH_URL"], searchParmas) +        res = get_with_params(os.environ["WIKI_SEARCH_URL"], searchParmas)          # FIXME-handle wiki redirects/disambiguations          if res is not None:              source = res[3][0]              result = summarizeLinkToAudio(source, summary) -            result = sanitizeText(result) +            result = sanitize_text(result)      except Exception as e:          logging.exception(e)      finally:          return result -def getAudioFromFile(audio_path: str) -> bytes: +def get_audio_from_file(audio_path: str) -> bytes:      """Returns the contents of a file in binary format."""      with open(audio_path, "rb") as audio:          return audio.read() @@ -397,15 +398,23 @@ def getSentiments(detailed: bool) -> list:  """ +def get_keywords_from_text(text: str) -> typing.List[str]: +    """Extract keywords out of text.""" +    rake_nltk_var = rake_nltk.Rake() +    rake_nltk_var.extract_keywords_from_text(text) +    return rake_nltk_var.get_ranked_phrases() + +  app = fastapi.FastAPI()  nltk.download("punkt")  nltk.download("stopwords") +nltk.download("wordnet")  # https://cheatsheetseries.owasp.org/cheatsheets/REST_Security_Cheat_Sheet.html  @app.middleware("http") -async def addSecureHeaders( +async def add_secure_headers(      request: fastapi.Request, call_next  ) -> fastapi.Response:      """Adds security headers proposed by OWASP.""" @@ -425,35 +434,50 @@ def pdf_ep(  ):      """The pdf manupulation endpoint."""      if feat == "": -        text = pdfToText(url) +        text = pdf_to_text(url)          if summarize: -            text = summarizeText_v2(text) +            text = summarize_text_v2(text)          if audio: -            audio_path = textToAudio(text) +            audio_path = text_to_audio(text)              return fastapi.Response( -                getAudioFromFile(audio_path) if audio_path != "" else "", +                get_audio_from_file(audio_path) if audio_path != "" else "",                  media_type="audio/mpeg",              )          return {              "Content-Type": "application/json", -            "isOk": True if text != "" else False, +            "isOk": bool(text),              "result": text,          }      elif feat == "refs": -        refs = extractRefs(url) +        refs = extract_refs(url)          return {              "Content-Type": "application/json", -            "isOk": True if refs is not None else False, +            "isOk": bool(refs),              "result": refs,          } +    elif feat == "keyword": +        text = pdf_to_text(url) +        keywords = get_keywords_from_text(text) +        return { +            "Content-Type": "application/json", +            "isOk": bool(keywords), +            "result": keywords, +        } +    else: +        return { +            "Content-Type": "application/json", +            "isOk": False, +            "result": "unknown feature requested", +        } +# TODO- currently not working  @app.get("/mila/tika")  def pdf_to_audio_ep(url: str):      """Turns a pdf into an audiofile.""" -    audio_path = pdfToVoice() +    audio_path = pdf_to_voice()      return fastapi.Response( -        getAudioFromFile(audio_path) if audio_path != "" else "", +        get_audio_from_file(audio_path) if audio_path != "" else "",          media_type="audio/mpeg",      ) @@ -464,7 +488,7 @@ def extract_reqs_ep(url: str, sourcetype: str = "html"):      result = getRequirements(url, sourcetype)      return {          "Content-Type": "application/json", -        "isOK": True if result is not None else False, +        "isOK": bool(result),          "reqs": result,      } @@ -474,15 +498,15 @@ def wiki_search_ep(term: str, summary: str = "none", audio: bool = False):      """Search and summarizes from wikipedia."""      text = searchWikipedia(term, summary)      if audio: -        audio_path = textToAudio(text) +        audio_path = text_to_audio(text)          return fastapi.Response( -            getAudioFromFile(audio_path) if audio_path != "" else "", +            get_audio_from_file(audio_path) if audio_path != "" else "",              media_type="audio/mpeg",          )      else:          return {              "Content-Type": "application/json", -            "isOK": True if text != "" else False, +            "isOK": bool(text),              "audio": "",              "text": text,          } @@ -493,16 +517,16 @@ def summarize_ep(url: str, summary: str = "none", audio: bool = False):      """Summarize and turn the summary into audio."""      text = summarizeLinkToAudio(url, summary)      if audio: -        audio_path = textToAudio(text) +        audio_path = text_to_audio(text)          print(audio_path)          return fastapi.Response( -            getAudioFromFile(audio_path) if audio_path != "" else "", +            get_audio_from_file(audio_path) if audio_path != "" else "",              media_type="audio/mpeg",          )      else:          return {              "Content-Type": "application/json", -            "isOK": True if text != "" else False, +            "isOK": bool(text),              # "audio": "",              "text": text,          } @@ -513,16 +537,16 @@ def mila_ep(url: str, summary: str = "newspaper", audio: bool = False):      """Extract all the urls and then summarize and turn into audio."""      text = summarizeLinksToAudio(url, summary)      if audio: -        audio_path = textToAudio(text) +        audio_path = text_to_audio(text)          print(audio_path)          return fastapi.Response( -            getAudioFromFile(audio_path) if audio_path != "" else "", +            get_audio_from_file(audio_path) if audio_path != "" else "",              media_type="audio/mpeg",          )      else:          return {              "Content-Type": "application/json", -            "isOK": True if text != "" else False, +            "isOK": bool(text),              "audio": "",              "text": text,          } diff --git a/devourer/poetry.lock b/devourer/poetry.lock index ce2a8e6..be1bb56 100644 --- a/devourer/poetry.lock +++ b/devourer/poetry.lock @@ -663,14 +663,6 @@ optional = false  python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"  [[package]] -name = "python-magic-bin" -version = "0.4.14" -description = "File type identification using libmagic binary package" -category = "main" -optional = false -python-versions = "*" - -[[package]]  name = "pytz"  version = "2022.1"  description = "World timezone definitions, modern and historical" @@ -687,6 +679,17 @@ optional = false  python-versions = ">=3.6"  [[package]] +name = "rake-nltk" +version = "1.0.6" +description = "RAKE short for Rapid Automatic Keyword Extraction algorithm, is a domain independent keyword extraction algorithm which tries to determine key phrases in a body of text by analyzing the frequency of word appearance and its co-occurance with other words in the text." +category = "main" +optional = false +python-versions = ">=3.6,<4.0" + +[package.dependencies] +nltk = ">=3.6.2,<4.0.0" + +[[package]]  name = "readability-lxml"  version = "0.8.1"  description = "fast html to text parser (article readability tool) with python 3 support" @@ -1157,7 +1160,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-  [metadata]  lock-version = "1.1"  python-versions = "^3.8" -content-hash = "a740bd5805861994b28f7a187d06be052f26dd920355f6624955134e62cb6059" +content-hash = "24d72bba62f852cba715eb9f954a00454da5f045138088d365fc27d8de2a035f"  [metadata.files]  absl-py = [ @@ -1642,11 +1645,6 @@ python-magic = [      {file = "python-magic-0.4.25.tar.gz", hash = "sha256:21f5f542aa0330f5c8a64442528542f6215c8e18d2466b399b0d9d39356d83fc"},      {file = "python_magic-0.4.25-py2.py3-none-any.whl", hash = "sha256:1a2c81e8f395c744536369790bd75094665e9644110a6623bcc3bbea30f03973"},  ] -python-magic-bin = [ -    {file = "python_magic_bin-0.4.14-py2.py3-none-macosx_10_6_intel.whl", hash = "sha256:7b1743b3dbf16601d6eedf4e7c2c9a637901b0faaf24ad4df4d4527e7d8f66a4"}, -    {file = "python_magic_bin-0.4.14-py2.py3-none-win32.whl", hash = "sha256:34a788c03adde7608028203e2dbb208f1f62225ad91518787ae26d603ae68892"}, -    {file = "python_magic_bin-0.4.14-py2.py3-none-win_amd64.whl", hash = "sha256:90be6206ad31071a36065a2fc169c5afb5e0355cbe6030e87641c6c62edc2b69"}, -]  pytz = [      {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"},      {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, @@ -1686,6 +1684,10 @@ pyyaml = [      {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"},      {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"},  ] +rake-nltk = [ +    {file = "rake-nltk-1.0.6.tar.gz", hash = "sha256:7813d680b2ce77b51cdac1757f801a87ff47682c9dbd2982aea3b66730346122"}, +    {file = "rake_nltk-1.0.6-py3-none-any.whl", hash = "sha256:1c1ffdb64cae8cb99d169d53a5ffa4635f1c4abd3a02c6e22d5d083136bdc5c1"}, +]  readability-lxml = [      {file = "readability-lxml-0.8.1.tar.gz", hash = "sha256:e51fea56b5909aaf886d307d48e79e096293255afa567b7d08bca94d25b1a4e1"},      {file = "readability_lxml-0.8.1-py3-none-any.whl", hash = "sha256:e0d366a21b1bd6cca17de71a4e6ea16fcfaa8b0a5b4004e39e2c7eff884e6305"}, diff --git a/devourer/pyproject.toml b/devourer/pyproject.toml index 2f79961..52b6b8c 100644 --- a/devourer/pyproject.toml +++ b/devourer/pyproject.toml @@ -19,6 +19,7 @@ transformers = "^4.11.2"  fastapi = "^0.70.0"  uvicorn = "^0.15.0"  refextract = "^1.1.4" +rake-nltk = "^1.0.6"  [tool.poetry.dev-dependencies] diff --git a/devourer/tests.sh b/devourer/tests.sh index e673acc..4ba4ea0 100755 --- a/devourer/tests.sh +++ b/devourer/tests.sh @@ -1,8 +1,12 @@  #!/usr/bin/env sh  curl -k -X GET "https://localhost:19019/mila/summ?url=https://dilipkumar.medium.com/standalone-mongodb-on-kubernetes-cluster-19e7b5896b27&summary=newspaper&audio=true" +  curl -k -X GET "https://localhost:19019/mila/wiki?term=iommu&summary=none&audio=false" +  curl -k -X GET "https://localhost:19019/mila/reqs?url=https://www.ietf.org/rfc/rfc2865.txt&sourcetype=text" +  curl -k -X GET "https://localhost:19019/mila/pdf?feat=&url=https://www.rroij.com/open-access/mutation-testing-a-review-33-36.pdf"  curl -k -X GET "https://localhost:19019/mila/pdf?feat=refs&url=https://www.rroij.com/open-access/mutation-testing-a-review-33-36.pdf" +curl -k -X GET "https://localhost:19019/mila/pdf?feat=keyword&url=https://www.rroij.com/open-access/mutation-testing-a-review-33-36.pdf&summarize=true"  curl -k -X GET "https://localhost:19019/mila/pdf?feat=&url=https://www.rroij.com/open-access/mutation-testing-a-review-33-36.pdf&summarize=true"  | 
