aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorterminaldweller <thabogre@gmail.com>2021-10-06 22:30:13 +0000
committerterminaldweller <thabogre@gmail.com>2021-10-06 22:30:13 +0000
commit45f45cd9f76af87fc848551a555f02c49dc7f75b (patch)
tree04467c054964fe231239b57c883522a058b8f25b
parentWIP (diff)
downloaddevourer-45f45cd9f76af87fc848551a555f02c49dc7f75b.tar.gz
devourer-45f45cd9f76af87fc848551a555f02c49dc7f75b.zip
WIP
-rwxr-xr-xdevourer.py (renamed from main.py)110
-rw-r--r--pyproject.toml3
2 files changed, 93 insertions, 20 deletions
diff --git a/main.py b/devourer.py
index 8176293..863a8b0 100755
--- a/main.py
+++ b/devourer.py
@@ -20,6 +20,9 @@ from gtts import gTTS
from datetime import datetime as time
+WIKIPEDIA_SEARCH_URL = "https://en.wikipedia.org/w/api.php"
+
+
class Argparser(object):
def __init__(self):
parser = argparse.ArgumentParser()
@@ -44,6 +47,11 @@ class Argparser(object):
default=False, help="convert pdf to mp3. \
source should be the path to a pdf file and\
out should be the path to the mp3 output file")
+ parser.add_argument("--summary", type=str, default="newspaper",
+ help="which summary type to use. currently we \
+ have newspaper, bart and none.")
+ parser.add_argument("--search", type=str,
+ default="", help="the search query")
self.args = parser.parse_args()
@@ -61,7 +69,7 @@ def isAGoodResponse(resp: Response) -> bool:
def simpleGet(url: str) -> bytes:
- """issues a simple get request to download a website"""
+ """issues a simple get request"""
try:
with closing(get(url, stream=True)) as resp:
if isAGoodResponse(resp):
@@ -73,6 +81,19 @@ def simpleGet(url: str) -> bytes:
return None
+def getWithParams(url: str, params: dict) -> dict:
+    """issues a get request with query parameters and decodes the JSON reply
+
+    url is the endpoint and params the query-string parameters.
+    returns the decoded JSON body on success. returns None when
+    isAGoodResponse rejects the response, when the request itself fails,
+    or when the body cannot be decoded as JSON.
+    """
+    try:
+        with closing(get(url, params=params, stream=True)) as resp:
+            if not isAGoodResponse(resp):
+                return None
+            return resp.json()
+    except (RequestException, ValueError) as e:
+        # resp.json() reports a malformed body with a ValueError subclass,
+        # which older requests releases do not derive from RequestException
+        logError("Error during requests to {0} : {1}".format(url, str(e)))
+        return None
+
+
def getURLS(source: str) -> dict:
"""extracts the urls from a website"""
result = dict()
@@ -103,8 +124,9 @@ def configNews(config: Config) -> None:
config.browser_user_agent = "Chrome/91.0.4464.5"
-def call_from_shell_list(command_list):
- # should probably deprecate this at some point
+def call_from_shell_list(command_list: list):
+ """run a shell command given a list of command/arguments"""
+ # TODO-should probably deprecate this at some point
if sys.version_info < (3, 7):
return subprocess.run(command_list, stdout=subprocess.PIPE)
else:
@@ -132,6 +154,8 @@ def pdfToVoice(argparser: Argparser) -> None:
def extractRequirements(textBody: str) -> list:
+ """extract the sentences containing the keywords
+ that denote a requirement"""
result = []
REQ_KEYWORDS = ["shall", "should", "must", "may", "can", "could"]
nltk.download("punkt")
@@ -143,6 +167,28 @@ def extractRequirements(textBody: str) -> list:
return result
+def summarizeText(text: str) -> str:
+    """summarize the given text using bart
+
+    loads the facebook/bart-large-cnn checkpoint, generates a summary of
+    text with beam search and returns it as a single string.
+    """
+    from transformers import BartTokenizer, BartForConditionalGeneration
+    model = BartForConditionalGeneration.from_pretrained(
+        'facebook/bart-large-cnn')
+    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+    # truncation has to be requested explicitly; without it max_length is
+    # not enforced and a long article is passed to the model uncut
+    inputs = tokenizer([text], max_length=1024, truncation=True,
+                       return_tensors='pt')
+    # no max_length override here: the former cap of 5 tokens cut every
+    # summary off after a few words, so the checkpoint's own generation
+    # limits apply instead
+    summary_ids = model.generate(
+        inputs['input_ids'], num_beams=4, early_stopping=True)
+    summaries = [tokenizer.decode(g,
+                                  skip_special_tokens=True,
+                                  clean_up_tokenization_spaces=False)
+                 for g in summary_ids]
+    # the caller feeds the result to gTTS, so return a str and not a list
+    return " ".join(summaries)
+
+
+def textToAudio(text: str) -> None:
+    """convert text to speech and save it as a timestamp-named mp3
+
+    the file is written relative to the current working directory.
+    """
+    speech = gTTS(text)
+    stamp = time.today().strftime("%b-%d-%Y-%M-%S-%f")
+    speech.save(stamp + ".mp3")
+
+
def singleLinkMode(argparser: Argparser) -> dict:
"""runs the single-link main function"""
if argparser.args.sourcetype == "html":
@@ -162,35 +208,59 @@ def singleLinkMode(argparser: Argparser) -> dict:
extractRequirements(bytesText.decode("utf-8"))
-def multiLinkMode(argparser: Argparser) -> None:
- """run the multi-link main function"""
+def summarizeLinkToAudio(argparser: Argparser) -> None:
+    """summarizes the text inside a given url into audio
+
+    argparser.args.source is the article url and argparser.args.summary
+    selects what is spoken: "newspaper" (newspaper's own summary),
+    "bart" (summarizeText) or "none" (the full article text).
+    any exception raised along the way is logged, not propagated.
+    """
+    try:
+        summaryType = argparser.args.summary
+        # reject an unknown summary type before doing any network work
+        if summaryType not in ("newspaper", "none", "bart"):
+            print("invalid option for summary type.")
+            return
+        article = Article(argparser.args.source)
+        article.download()
+        article.parse()
+        if summaryType == "newspaper":
+            article.nlp()
+            spoken = article.summary
+        elif summaryType == "bart":
+            spoken = summarizeText(article.text)
+            # summarizeText may hand back a list of summaries; gTTS
+            # needs a single string
+            if not isinstance(spoken, str):
+                spoken = " ".join(spoken)
+        else:
+            spoken = article.text
+        textToAudio(spoken)
+    except Exception as e:
+        logging.exception(e)
+
+def summarizeLinksToAudio(argparser: Argparser) -> None:
+    """summarize a list of urls into audio files
+
+    the urls are extracted from argparser.args.source by getURLS and each
+    one is handed to summarizeLinkToAudio in turn. argparser.args.source
+    is overwritten per link and restored once the loop is done.
+    """
     config = Config()
     configNews(config)
     urls = getURLS(argparser.args.source)
+    listSource = argparser.args.source
     for url in urls:
-        parser = build(url)
-        for article in parser.articles:
-            a = Article(article.url)
-            try:
-                a.download()
-                a.parse()
-                doc = Document(a.html)
-                print(doc.summary())
-                if a.text != '':
-                    tts = gTTS(a.text)
-                    tts.save(time.today().strftime("%b-%d-%Y-%M-%S-%f")+".mp3")
-            except Exception as e:
-                logging.exception(e)
+        # summarizeLinkToAudio reads its url from argparser.args.source, so
+        # point that at the current link instead of passing the bare string
+        argparser.args.source = url
+        summarizeLinkToAudio(argparser)
+    argparser.args.source = listSource
+
+
+def searchWikipedia(argparser: Argparser) -> str:
+    """search wikipedia for a string and return the url
+
+    queries WIKIPEDIA_SEARCH_URL with argparser.args.search, stores the url
+    of the first hit in argparser.args.source, turns that page into audio
+    and returns the url. returns an empty string when the request fails
+    or nothing is found.
+    """
+    searchParams = {
+        "action": "opensearch",
+        "namespace": "0",
+        "search": argparser.args.search,
+        "limit": "10",
+        "format": "json"
+    }
+    res = getWithParams(WIKIPEDIA_SEARCH_URL, searchParams)
+    print(res)
+    # the reply is expected to be a list whose fourth item holds the result
+    # urls; getWithParams gives None on failure and the url list is empty
+    # when nothing matches
+    if not isinstance(res, list) or len(res) < 4 or not res[3]:
+        logging.error("no wikipedia result for %s", argparser.args.search)
+        return ""
+    argparser.args.source = res[3][0]
+    summarizeLinkToAudio(argparser)
+    return argparser.args.source
def main() -> None:
# parse the command line and run the first mode whose flag is set:
# single link, multi link, pdf to mp3, then wikipedia search
argparser = Argparser()
if argparser.args.singlelink:
-singleLinkMode(argparser)
+summarizeLinkToAudio(argparser)
elif argparser.args.multilink:
-multiLinkMode(argparser)
+summarizeLinksToAudio(argparser)
elif argparser.args.pdftomp3:
pdfToVoice(argparser)
# --search defaults to "", so this branch only runs for a non-empty query
+ elif argparser.args.search:
+ searchWikipedia(argparser)
else:
# no mode selected: nothing to do
pass
diff --git a/pyproject.toml b/pyproject.toml
index 1e09611..ca26b0f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,6 +14,9 @@ gtts = "^2.2.3"
tika = "^1.24"
docker = "^5.0.2"
nltk = "^3.6.3"
+tensorflow = "^2.6.0"
+torch = "^1.9.1"
+transformers = "^4.11.2"
[tool.poetry.dev-dependencies]