aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorterminaldweller <thabogre@gmail.com>2021-09-24 09:30:08 +0000
committerterminaldweller <thabogre@gmail.com>2021-09-24 09:30:08 +0000
commit96b963fc1ab6e6d20581908e19e36cd01f2cf47b (patch)
treee8c014f880e2d0a0f3419c252cfb56ad5b3d99c2
parentadded typehints and comments (diff)
downloaddevourer-96b963fc1ab6e6d20581908e19e36cd01f2cf47b.tar.gz
devourer-96b963fc1ab6e6d20581908e19e36cd01f2cf47b.zip
WIP
-rw-r--r--Dockerfile2
-rwxr-xr-xmain.py111
-rw-r--r--pyproject.toml3
3 files changed, 106 insertions, 10 deletions
diff --git a/Dockerfile b/Dockerfile
index 558380c..7e5641c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -23,5 +23,5 @@ FROM python-base as production
ENV FASTAPI_ENV=production
COPY --from=builder-base $VENV_PATH $VENV_PATH
COPY ./main.py $PYSETUP_PATH/main.py
-ENTRYPOINT $PYSETUP_PATH/main.py
+# The exec (JSON) form of ENTRYPOINT is not processed by a shell, so a bare
+# "$PYSETUP_PATH/main.py" would be passed literally and never expanded.
+# Going through sh expands the variable, `exec` keeps main.py as PID 1 and
+# "$@" forwards whatever CMD / `docker run` arguments are supplied.
+ENTRYPOINT ["/bin/sh", "-c", "exec \"$PYSETUP_PATH/main.py\" \"$@\"", "--"]
# CMD ["--source", "https://github.com/coinpride/CryptoList"]
diff --git a/main.py b/main.py
index ef89ca1..8176293 100755
--- a/main.py
+++ b/main.py
@@ -3,6 +3,12 @@
import argparse
import logging
+import subprocess
+import sys
+import tika
+import docker
+import os
+import nltk
from newspaper import Article, build, Config
from bs4 import BeautifulSoup
from contextlib import closing
@@ -17,15 +23,31 @@ from datetime import datetime as time
class Argparser(object):
+    """parses the command line arguments; the result is kept in self.args"""
def __init__(self):
parser = argparse.ArgumentParser()
- parser.add_argument(
- "--source",
- type=str, help="the url where the urls to be extracted reside")
- parser.add_argument("--bool", action="store_true",
- help="bool", default=False)
+        # Help texts are built from adjacent string literals instead of
+        # backslash continuations inside one literal, so the indentation
+        # of the source file does not end up inside the help strings.
+        parser.add_argument("--source", type=str,
+                            help="the url where the "
+                            "urls to be extracted reside")
+        parser.add_argument("--out", type=str,
+                            help="the output file", default="")
+        # the three mode switches below are checked by main() in the
+        # order singlelink, multilink, pdftomp3
+        parser.add_argument("--singlelink", action="store_true",
+                            help="whether the app should work in "
+                            "single-link mode, meaning only one page's "
+                            "contents will be used",
+                            default=False)
+        parser.add_argument("--multilink", action="store_true",
+                            help="whether the app should work in "
+                            "multi-link mode, meaning the source contains "
+                            "a list of links rather than being the actual "
+                            "source itself",
+                            default=False)
+        parser.add_argument("--sourcetype", type=str,
+                            help="determines the type of the "
+                            "source: html, text, ...")
+        parser.add_argument("--pdftomp3", action="store_true",
+                            default=False,
+                            help="convert pdf to mp3. "
+                            "source should be the path to a pdf file and "
+                            "out should be the path to the mp3 output file")
self.args = parser.parse_args()
-# TODO-maybe actually really do some logging
+# FIXME-maybe actually really do some logging
def logError(err: RequestException) -> None:
"""logs the errors"""
logging.exception(err)
@@ -35,7 +57,7 @@ def isAGoodResponse(resp: Response) -> bool:
"""checks whether the get we sent got a 200 response"""
content_type = resp.headers['Content-Type'].lower()
return (resp.status_code == 200 and
- content_type is not None and content_type.find("html") > -1)
+ content_type is not None)
def simpleGet(url: str) -> bytes:
@@ -81,8 +103,67 @@ def configNews(config: Config) -> None:
config.browser_user_agent = "Chrome/91.0.4464.5"
-def main() -> None:
- argparser = Argparser()
+def call_from_shell_list(command_list):
+    """Run a command given as an argument list and capture its output.
+
+    Args:
+        command_list: the program followed by its arguments, e.g.
+            ["ls", "-l"]. It is handed to subprocess.run unchanged, so
+            no shell is involved.
+
+    Returns:
+        The subprocess.CompletedProcess of the finished command; its
+        stdout and stderr attributes hold the captured bytes.
+    """
+    # should probably deprecate this at some point
+    # Explicit pipes are what capture_output=True (Python >= 3.7 only)
+    # expands to. Unlike the former pre-3.7 fallback they capture stderr
+    # as well, so the result no longer depends on the interpreter version.
+    return subprocess.run(command_list, stdout=subprocess.PIPE,
+                          stderr=subprocess.PIPE)
+
+
+def pdfToVoice(argparser: Argparser) -> None:
+    """main function for converting a pdf to an mp3
+
+    Starts an Apache Tika container, waits until the server answers on
+    the published port, extracts the text of the pdf given by --source
+    and saves the synthesized speech to the path given by --out.
+
+    Raises:
+        TimeoutError: the Tika server did not answer in time.
+        ValueError: Tika returned no text for the pdf.
+    """
+    # Imported locally: at module level the name `time` is bound to
+    # datetime.datetime, which has no sleep().
+    from time import monotonic, sleep
+    # `import tika` alone does not load the parser submodule.
+    from tika import parser as tikaParser
+
+    TIKA_HOST = "127.0.0.1"
+    TIKA_HOST_PORT = 9977
+    # the endpoint is used as a URL, so it needs its scheme
+    TIKA_SERVER_ENDPOINT = "http://{}:{}".format(TIKA_HOST, TIKA_HOST_PORT)
+    STARTUP_TIMEOUT = 60  # seconds to wait for the server to come up
+    os.environ["TIKA_SERVER_ENDPOINT"] = TIKA_SERVER_ENDPOINT
+    dockerClient = docker.from_env()
+    try:
+        # docker-py maps {port inside the container: binding on the
+        # host}; the Tika image serves on 9998.
+        container = dockerClient.containers.run(
+            "apache/tika:2.0.0", detach=True,
+            ports={"9998/tcp": (TIKA_HOST, TIKA_HOST_PORT)})
+        try:
+            deadline = monotonic() + STARTUP_TIMEOUT
+            while True:
+                try:
+                    if get(TIKA_SERVER_ENDPOINT).status_code == 200:
+                        break
+                except RequestException:
+                    # the connection is refused or reset until the
+                    # server inside the container is listening
+                    pass
+                if monotonic() > deadline:
+                    raise TimeoutError(
+                        "tika server did not become ready at "
+                        + TIKA_SERVER_ENDPOINT)
+                sleep(.5)
+            rawText = tikaParser.from_file(
+                argparser.args.source,
+                serverEndpoint=TIKA_SERVER_ENDPOINT)
+            if not rawText.get("content"):
+                raise ValueError(
+                    "no text could be extracted from "
+                    + str(argparser.args.source))
+            tts = gTTS(rawText['content'])
+            tts.save(argparser.args.out)
+        finally:
+            # stop the container even when parsing or synthesis failed
+            container.stop()
+    finally:
+        dockerClient.close()
+
+
+def extractRequirements(textBody: str, sentenceTokenizer=None) -> list:
+    """returns the sentences of textBody that contain a requirement keyword
+
+    A sentence is kept when it contains one of the modal verbs in
+    REQ_KEYWORDS as a whole word, in any letter case. Each sentence is
+    returned at most once and the original order is preserved.
+
+    Args:
+        textBody: the text to scan.
+        sentenceTokenizer: optional callable turning a string into an
+            iterable of sentences. Defaults to nltk.sent_tokenize, which
+            needs the "punkt" data.
+
+    Returns:
+        the list of matching sentences (empty when nothing matches).
+    """
+    import re
+
+    REQ_KEYWORDS = ["shall", "should", "must", "may", "can", "could"]
+    if sentenceTokenizer is None:
+        # fetch the tokenizer data only when it is not installed yet
+        # instead of calling the downloader on every invocation
+        try:
+            nltk.data.find("tokenizers/punkt")
+        except LookupError:
+            nltk.download("punkt")
+        sentenceTokenizer = nltk.sent_tokenize
+    # Whole-word match: a plain substring test would also hit "scan" or
+    # "mayor", and a sentence holding several keywords would be appended
+    # once per keyword.
+    keywordPattern = re.compile(
+        r"\b(?:" + "|".join(REQ_KEYWORDS) + r")\b", re.IGNORECASE)
+    return [sentence for sentence in sentenceTokenizer(textBody)
+            if keywordPattern.search(sentence)]
+
+
+def singleLinkMode(argparser: Argparser) -> dict:
+    """runs the single-link main function
+
+    For --sourcetype html the source is built with newspaper and every
+    article found is downloaded, summarized and scanned; for
+    --sourcetype text the source is fetched and scanned directly.
+
+    Returns:
+        a dict mapping each processed url to the list returned by
+        extractRequirements for it. It is empty for any other source
+        type or when nothing could be processed.
+    """
+    requirements = {}
+    sourceType = argparser.args.sourcetype
+    if sourceType == "html":
+        parser = build(argparser.args.source)
+        for article in parser.articles:
+            a = Article(article.url)
+            try:
+                a.download()
+                a.parse()
+                doc = Document(a.html)
+                summary = doc.summary()
+                print(summary)
+                requirements[article.url] = extractRequirements(summary)
+            except Exception as e:
+                # one broken article must not abort the whole run
+                logging.exception(e)
+    elif sourceType == "text":
+        bytesText = simpleGet(argparser.args.source)
+        # NOTE(review): simpleGet's body is not visible here; the guard
+        # covers the case where it yields nothing for a failed request
+        if bytesText:
+            requirements[argparser.args.source] = extractRequirements(
+                bytesText.decode("utf-8"))
+    return requirements
+
+
+def multiLinkMode(argparser: Argparser) -> None:
+ """run the multi-link main function"""
config = Config()
configNews(config)
urls = getURLS(argparser.args.source)
@@ -102,5 +183,17 @@ def main() -> None:
logging.exception(e)
+def main() -> None:
+    """entry point: parses the command line and runs the selected mode"""
+    argparser = Argparser()
+    # Checked in this order; the first flag that is set wins, and a run
+    # without any mode flag does nothing.
+    modes = (
+        ("singlelink", singleLinkMode),
+        ("multilink", multiLinkMode),
+        ("pdftomp3", pdfToVoice),
+    )
+    for flag, handler in modes:
+        if getattr(argparser.args, flag):
+            handler(argparser)
+            break
+
+
if __name__ == "__main__":
main()
diff --git a/pyproject.toml b/pyproject.toml
index 8eaddf2..1e09611 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,9 @@ newspaper3k = "^0.2.8"
beautifulsoup4 = "^4.9.3"
readability-lxml = "^0.8.1"
gtts = "^2.2.3"
+tika = "^1.24"
+docker = "^5.0.2"
+nltk = "^3.6.3"
[tool.poetry.dev-dependencies]