diff options
-rw-r--r-- | Dockerfile | 27 | ||||
-rwxr-xr-x | main.py | 65 | ||||
-rw-r--r-- | pyproject.toml | 17 | ||||
-rwxr-xr-x | run.sh | 8 |
4 files changed, 108 insertions, 9 deletions
diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..558380c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.8.11-slim as python-base +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_NO_CACHE_DIR=off \ + PIP_DISABLE_PIP_VERSION_CHECK=on \ + PIP_DEFAULT_TIMEOUT=100 \ + POETRY_HOME="/poetry" \ + POETRY_VIRTUALENVS_IN_PROJECT=true \ + POETRY_NO_INTERACTION=1 \ + PYSETUP_PATH="/devourer" \ + VENV_PATH="/devourer/.venv" +ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" + +FROM python-base as builder-base +ENV POETRY_VERSION=1.0.0 +RUN apt update && apt install -y --no-install-recommends curl build-essential +RUN curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py | python +WORKDIR $PYSETUP_PATH +COPY ./pyproject.toml ./ +RUN poetry install --no-dev + +FROM python-base as production +ENV FASTAPI_ENV=production +COPY --from=builder-base $VENV_PATH $VENV_PATH +COPY ./main.py $PYSETUP_PATH/main.py +ENTRYPOINT $PYSETUP_PATH/main.py +# CMD ["--source", "https://github.com/coinpride/CryptoList"] @@ -3,24 +3,75 @@ import argparse import logging -import traceback from newspaper import Article, build -import fileinput +from bs4 import BeautifulSoup +from contextlib import closing +from requests import get +from requests.exceptions import RequestException +from re import findall class Argparser(object): def __init__(self): parser = argparse.ArgumentParser() - parser.add_argument("--string", type=str, help="string") + parser.add_argument( + "--source", + type=str, help="the url where the urls to be extracted reside") parser.add_argument("--bool", action="store_true", help="bool", default=False) - parser.add_argument("--dbg", action="store_true", - help="debug", default=False) self.args = parser.parse_args() +# TODO-maybe actually really do some logging +def logError(err): + print(err) + + +def isAGoodResponse(resp): + content_type = resp.headers['Content-Type'].lower() + return (resp.status_code == 200 and + content_type is not None and content_type.find("html") > -1) + + +def simpleGet(url): + try: + with closing(get(url, stream=True)) as resp: + if isAGoodResponse(resp): + return resp.content + else: + return None + except RequestException as e: + logError("Error during requests to {0} : {1}".format(url, str(e))) + return None + + +def getURLS(source): + result = dict() + raw_ml = simpleGet(source) + ml = BeautifulSoup(raw_ml, "lxml") + ml_str = repr(ml) + tmp = open("/tmp/riecher", "w") + tmp.write(ml_str) + tmp.close() + tmp = open("/tmp/riecher", "r") + dump_list = [] + for line in tmp: + dummy = findall( + 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|' + r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', line) + dump_list += dummy + for elem in dump_list: + result[elem] = elem + tmp.close() + return result + + def main(): - urls = (line for line in fileinput.input()) + argparser = Argparser() + urls = getURLS(argparser.args.source) + # import sys + # print(urls) + # sys.exit(0) for url in urls: parser = build(url) for article in parser.articles: @@ -30,7 +81,7 @@ def main(): a.parse() print(a.text) except Exception as e: - logging.error(traceback.format_exc(e)) + logging.exception(e) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f91a38f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,17 @@ +[tool.poetry] +name = "devourer" +version = "0.1.0" +description = "" +authors = ["terminaldweller <thabogre@gmail.com>"] +license = "GPL3.0" + +[tool.poetry.dependencies] +python = "^3.8" +newspaper3k = "^0.2.8" +beautifulsoup4 = "^4.9.3" + +[tool.poetry.dev-dependencies] + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" @@ -1,3 +1,7 @@ -#!/usr/bin/env zsh +#!/usr/bin/env sh +set -e +set -x -sniff --src https://github.com/coinpride/CryptoList --url | ./main.py > out2.txt +# sniff --src https://github.com/coinpride/CryptoList --url | ./main.py > out2.txt +./main.py --source https://github.com/coinpride/CryptoList > out.txt +# cat out2.txt| pico2wave --wave=out2.wav |