From 5d513298cdb97e8a98b12b5fa9d02331ad7d5008 Mon Sep 17 00:00:00 2001
From: terminaldweller
Date: Tue, 14 Jun 2022 20:05:25 +0430
Subject: added a summarization that works

---
 devourer/devourer.py | 75 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 58 insertions(+), 17 deletions(-)

diff --git a/devourer/devourer.py b/devourer/devourer.py
index 230adf9..29d4506 100644
--- a/devourer/devourer.py
+++ b/devourer/devourer.py
@@ -10,15 +10,15 @@ import string
 import tempfile
 import typing
 
-import bs4
+import bs4  # type:ignore
 import fastapi
-import gtts
-import newspaper
-import nltk
-import readability
-import refextract
+import gtts  # type:ignore
+import newspaper  # type:ignore
+import nltk  # type:ignore
+import readability  # type:ignore
+import refextract  # type:ignore
 import requests
-import tika
+import tika  # type:ignore
 import transformers
 from tika import parser as tparser
 
@@ -150,6 +150,7 @@ def extractRequirements(textBody: str) -> list:
 
 
 def extractRefs(url: str) -> list:
+    """Extract the references from an article."""
     refs = list()
     try:
         refs = refextract.extract_references_from_url(url)
@@ -226,7 +227,44 @@ def summarizeText(text: str) -> str:
 
 
 def summarizeText_v2(text: str) -> str:
-    pass
+    """Text summarization using nltk."""
+    stop_words = set(nltk.corpus.stopwords.words("english"))
+    words = nltk.tokenize.word_tokenize(text)
+    freq_table: typing.Dict[str, int] = {}
+
+    for word in words:
+        word = word.lower()
+        if word in stop_words:
+            continue
+        if word in freq_table:
+            freq_table[word] += 1
+        else:
+            freq_table[word] = 1
+
+    sentences = nltk.tokenize.sent_tokenize(text)
+    sentence_value: typing.Dict[str, int] = {}
+
+    for sentence in sentences:
+        for word, freq in freq_table.items():
+            if word in sentence.lower():
+                if sentence in sentence_value:
+                    sentence_value[sentence] += freq
+                else:
+                    sentence_value[sentence] = freq
+
+    sum_values: float = 0
+    for sentence, value in sentence_value.items():
+        sum_values += value
+
+    average = int(sum_values / len(sentence_value)) if sentence_value else 0
+    summary: str = ""
+    for sentence in sentences:
+        if (sentence in sentence_value) and (
+            sentence_value[sentence] > (1.2 * average)
+        ):
+            summary += " " + sentence
+
+    return summary
 
 
 def textToAudio(text: str) -> str:
@@ -317,7 +355,7 @@ def summarizeLinksToAudio(url: str, summary: str) -> str:
 def searchWikipedia(search_term: str, summary: str) -> str:
     """Search wikipedia for a string and return the url.
 
-    reference: https://www.mediawiki.org/wiki/API:Opensearch
+    reference: https://www.mediawiki.org/wiki/API:Opensearch.
     """
     result = str()
     try:
@@ -362,6 +400,7 @@ def getSentiments(detailed: bool) -> list:
 app = fastapi.FastAPI()
 
 nltk.download("punkt")
+nltk.download("stopwords")
 
 
 # https://cheatsheetseries.owasp.org/cheatsheets/REST_Security_Cheat_Sheet.html
@@ -369,7 +408,7 @@ nltk.download("punkt")
 async def addSecureHeaders(
     request: fastapi.Request, call_next
 ) -> fastapi.Response:
-    """adds security headers proposed by OWASP."""
+    """Adds security headers proposed by OWASP."""
     response = await call_next(request)
     response.headers["Cache-Control"] = "no-store"
     response.headers["Content-Security-Policy"] = "default-src-https"
@@ -384,11 +423,11 @@ async def addSecureHeaders(
 def pdf_ep(
     url: str, feat: str = "", audio: bool = False, summarize: bool = False
 ):
-    """the pdf manupulation endpoint."""
+    """The pdf manipulation endpoint."""
     if feat == "":
         text = pdfToText(url)
         if summarize:
-            text = summarizeText(text)
+            text = summarizeText_v2(text)
         if audio:
             audio_path = textToAudio(text)
             return fastapi.Response(
@@ -411,7 +450,7 @@ def pdf_ep(
 
 @app.get("/mila/tika")
 def pdf_to_audio_ep(url: str):
-    """turns a pdf into an audiofile."""
+    """Turns a pdf into an audiofile."""
     audio_path = pdfToVoice()
     return fastapi.Response(
         getAudioFromFile(audio_path) if audio_path != "" else "",
@@ -421,7 +460,7 @@ def pdf_to_audio_ep(url: str):
 
 @app.get("/mila/reqs")
 def extract_reqs_ep(url: str, sourcetype: str = "html"):
-    """extracts the requirements from a given url."""
+    """Extracts the requirements from a given url."""
     result = getRequirements(url, sourcetype)
     return {
         "Content-Type": "application/json",
@@ -432,7 +471,7 @@ def extract_reqs_ep(url: str, sourcetype: str = "html"):
 
 @app.get("/mila/wiki")
 def wiki_search_ep(term: str, summary: str = "none", audio: bool = False):
-    """search and summarizes from wikipedia."""
+    """Searches and summarizes from wikipedia."""
     text = searchWikipedia(term, summary)
     if audio:
         audio_path = textToAudio(text)
@@ -451,7 +490,7 @@ def wiki_search_ep(term: str, summary: str = "none", audio: bool = False):
 
 @app.get("/mila/summ")
 def summarize_ep(url: str, summary: str = "none", audio: bool = False):
-    """summarize and turn the summary into audio."""
+    """Summarize and turn the summary into audio."""
     text = summarizeLinkToAudio(url, summary)
     if audio:
         audio_path = textToAudio(text)
@@ -471,7 +510,7 @@ def summarize_ep(url: str, summary: str = "none", audio: bool = False):
 
 @app.get("/mila/mila")
 def mila_ep(url: str, summary: str = "newspaper", audio: bool = False):
-    """extract all the urls and then summarize and turn into audio."""
+    """Extract all the urls and then summarize and turn into audio."""
     text = summarizeLinksToAudio(url, summary)
     if audio:
         audio_path = textToAudio(text)
@@ -491,11 +530,13 @@ def mila_ep(url: str, summary: str = "newspaper", audio: bool = False):
 
 @app.get("/mila/health")
 def health_ep():
+    """The health endpoint."""
     return {"Content-Type": "application/json", "isOK": True}
 
 
 @app.get("/mila/robots.txt")
 def robots_ep():
+    """The robots endpoint."""
     return {
         "Content-Type": "application/json",
         "User-Agents": "*",
-- 
cgit v1.2.3