about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: terminaldweller <thabogre@gmail.com> 2021-07-29 11:41:20 +0000
committer: terminaldweller <thabogre@gmail.com> 2021-07-29 11:41:20 +0000
commit: fab529b1c0bc71fa4c904e7a251038426f81369f (patch)
tree: a75317cae4dfab07e5042067d1246516f89a44c3
parent: first commit (diff)
download: devourer-fab529b1c0bc71fa4c904e7a251038426f81369f.tar.gz
devourer-fab529b1c0bc71fa4c904e7a251038426f81369f.zip
no need for externally getting the links. devourer can do that on its own now. using poetry now. added a dockerfile.
-rw-r--r-- Dockerfile 27
-rwxr-xr-x main.py 65
-rw-r--r-- pyproject.toml 17
-rwxr-xr-x run.sh 8
4 files changed, 108 insertions, 9 deletions
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..558380c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,27 @@
+# Shared base stage: pins the Python image and defines the environment that
+# the builder and production stages below both inherit.
+FROM python:3.8.11-slim as python-base
+# PYSETUP_PATH is the project directory and VENV_PATH the virtualenv inside
+# it; both variables are referenced again by the later stages.
+ENV PYTHONUNBUFFERED=1 \
+ PYTHONDONTWRITEBYTECODE=1 \
+ PIP_NO_CACHE_DIR=off \
+ PIP_DISABLE_PIP_VERSION_CHECK=on \
+ PIP_DEFAULT_TIMEOUT=100 \
+ POETRY_HOME="/poetry" \
+ POETRY_VIRTUALENVS_IN_PROJECT=true \
+ POETRY_NO_INTERACTION=1 \
+ PYSETUP_PATH="/devourer" \
+ VENV_PATH="/devourer/.venv"
+# Put the Poetry and virtualenv bin directories ahead of the system PATH.
+ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH"
+
+# Build stage: installs Poetry and resolves the runtime dependencies into the
+# in-project virtualenv that the production stage copies out.
+FROM python-base as builder-base
+ENV POETRY_VERSION=1.0.0
+# apt-get rather than apt: apt warns that its command-line interface is not
+# stable when it is driven from a script.
+RUN apt-get update && apt-get install -y --no-install-recommends curl build-essential
+RUN curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py | python
+WORKDIR $PYSETUP_PATH
+COPY ./pyproject.toml ./
+RUN poetry install --no-dev
+
+# Runtime stage: only the prebuilt virtualenv and the script are copied in, so
+# the build toolchain from builder-base stays out of the final image.
+FROM python-base as production
+ENV FASTAPI_ENV=production
+COPY --from=builder-base $VENV_PATH $VENV_PATH
+COPY ./main.py $PYSETUP_PATH/main.py
+# Exec form: a shell-form ENTRYPOINT discards CMD and `docker run` arguments,
+# so --source could never reach main.py. Exec form performs no variable
+# expansion, hence the literal path (PYSETUP_PATH is /devourer); `python`
+# resolves to the virtualenv interpreter through PATH.
+ENTRYPOINT ["python", "/devourer/main.py"]
+# CMD ["--source", "https://github.com/coinpride/CryptoList"]
diff --git a/main.py b/main.py
index c9a1af6..00ce6d1 100755
--- a/main.py
+++ b/main.py
@@ -3,24 +3,75 @@
import argparse
import logging
-import traceback
from newspaper import Article, build
-import fileinput
+from bs4 import BeautifulSoup
+from contextlib import closing
+from requests import get
+from requests.exceptions import RequestException
+from re import findall
+# Command-line interface for the script: the arguments are parsed once on
+# construction and the resulting namespace is exposed as `self.args`.
class Argparser(object):
def __init__(self):
parser = argparse.ArgumentParser()
- parser.add_argument("--string", type=str, help="string")
+ parser.add_argument(
+ "--source",
+ type=str, help="the url where the urls to be extracted reside")
parser.add_argument("--bool", action="store_true",
help="bool", default=False)
- parser.add_argument("--dbg", action="store_true",
- help="debug", default=False)
self.args = parser.parse_args()
+def logError(err):
+    """Report *err* through the logging module.
+
+    logging is used instead of print so that error messages go to stderr and
+    stay out of the article text this script writes to stdout.
+
+    Args:
+        err: the message (or exception) to report.
+    """
+    logging.error(err)
+
+
+def isAGoodResponse(resp):
+    """Return True when *resp* is a 200 response carrying HTML.
+
+    Args:
+        resp: a response object exposing ``status_code`` and ``headers``.
+
+    Returns:
+        bool: True only for status 200 with "html" in the Content-Type.
+    """
+    # A response may omit Content-Type; treat that as "not HTML" instead of
+    # raising KeyError on the header lookup.
+    content_type = resp.headers.get("Content-Type", "").lower()
+    return resp.status_code == 200 and "html" in content_type
+
+
+def simpleGet(url, timeout=30):
+    """Fetch *url* and return the raw body if it is an HTML page.
+
+    Args:
+        url: address to download.
+        timeout: seconds to wait for the server before giving up; without
+            one, the request can block indefinitely on an unresponsive host.
+
+    Returns:
+        The response body as bytes, or None when the request fails or the
+        response is not a 200 HTML page.
+    """
+    try:
+        with closing(get(url, stream=True, timeout=timeout)) as resp:
+            return resp.content if isAGoodResponse(resp) else None
+    except RequestException as e:
+        # Timeouts are RequestException subclasses, so they land here too.
+        logError("Error during requests to {0} : {1}".format(url, str(e)))
+        return None
+
+
+def getURLS(source):
+    """Collect every http(s) URL found in the page at *source*.
+
+    Args:
+        source: address of the page to scan.
+
+    Returns:
+        dict: each distinct URL mapped to itself, in order of first
+        appearance; empty when the page could not be fetched.
+    """
+    raw_ml = simpleGet(source)
+    if raw_ml is None:
+        # simpleGet signals failure with None, which BeautifulSoup cannot
+        # parse; report it and hand back an empty result instead of crashing.
+        logError("could not fetch any HTML from {0}".format(source))
+        return dict()
+    ml_str = repr(BeautifulSoup(raw_ml, "lxml"))
+    # The pattern cannot match a newline, so scanning the whole document in
+    # one pass finds the same URLs, in the same order, as a line-by-line scan.
+    urls = findall(
+        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|'
+        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', ml_str)
+    return {url: url for url in urls}
+
+
def main():
- urls = (line for line in fileinput.input())
+ argparser = Argparser()
+ urls = getURLS(argparser.args.source)
+ # import sys
+ # print(urls)
+ # sys.exit(0)
for url in urls:
parser = build(url)
for article in parser.articles:
@@ -30,7 +81,7 @@ def main():
a.parse()
print(a.text)
except Exception as e:
- logging.error(traceback.format_exc(e))
+ logging.exception(e)
if __name__ == "__main__":
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..f91a38f
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,17 @@
+[tool.poetry]
+name = "devourer"
+version = "0.1.0"
+description = ""
+authors = ["terminaldweller <thabogre@gmail.com>"]
+license = "GPL3.0"
+
+[tool.poetry.dependencies]
+python = "^3.8"
+# NOTE(review): main.py also imports `requests` directly and asks
+# BeautifulSoup for the "lxml" parser, but neither package is declared here;
+# presumably both arrive transitively via newspaper3k -- confirm, or declare
+# them explicitly.
+newspaper3k = "^0.2.8"
+beautifulsoup4 = "^4.9.3"
+
+[tool.poetry.dev-dependencies]
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/run.sh b/run.sh
index 6037b8d..f3f73d7 100755
--- a/run.sh
+++ b/run.sh
@@ -1,3 +1,7 @@
-#!/usr/bin/env zsh
+#!/usr/bin/env sh
+set -e
+set -x
-sniff --src https://github.com/coinpride/CryptoList --url | ./main.py > out2.txt
+# sniff --src https://github.com/coinpride/CryptoList --url | ./main.py > out2.txt
+./main.py --source https://github.com/coinpride/CryptoList > out.txt
+# cat out2.txt| pico2wave --wave=out2.wav