diff options
author | terminaldweller <thabogre@gmail.com> | 2021-07-29 16:58:24 +0000 |
---|---|---|
committer | terminaldweller <thabogre@gmail.com> | 2021-07-29 16:58:24 +0000 |
commit | 5f67821f49a8e3c4573c4d3a8af977ff61dd51f3 (patch) | |
tree | 8205ae4a8e527d0d0e34a0821471f106ab7fc4a6 | |
parent | no need for externally getting the links. devourer can do that on its own now... (diff) | |
download | devourer-5f67821f49a8e3c4573c4d3a8af977ff61dd51f3.tar.gz devourer-5f67821f49a8e3c4573c4d3a8af977ff61dd51f3.zip |
added config for newspaper. added gitpod configs.
-rw-r--r-- | .gitpod.Dockerfile | 27 | ||||
-rw-r--r-- | .gitpod.yml | 5 | ||||
-rwxr-xr-x | main.py | 17 | ||||
-rwxr-xr-x | run.sh | 2 |
4 files changed, 46 insertions, 5 deletions
```diff
diff --git a/.gitpod.Dockerfile b/.gitpod.Dockerfile
new file mode 100644
index 0000000..2f126fa
--- /dev/null
+++ b/.gitpod.Dockerfile
@@ -0,0 +1,27 @@
+FROM gitpod/workspace-full
+FROM python:3.8.11-slim as python-base
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=off \
+    PIP_DISABLE_PIP_VERSION_CHECK=on \
+    PIP_DEFAULT_TIMEOUT=100 \
+    POETRY_HOME="/poetry" \
+    POETRY_VIRTUALENVS_IN_PROJECT=true \
+    POETRY_NO_INTERACTION=1 \
+    PYSETUP_PATH="/devourer" \
+    VENV_PATH="/devourer/.venv"
+ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH"
+
+FROM python-base as builder-base
+ENV POETRY_VERSION=1.0.0
+RUN apt update && apt install -y --no-install-recommends curl build-essential
+RUN curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py | python
+WORKDIR $PYSETUP_PATH
+COPY ./pyproject.toml ./
+RUN poetry install --no-dev
+
+FROM python-base as production
+ENV FASTAPI_ENV=production
+COPY --from=builder-base $VENV_PATH $VENV_PATH
+COPY ./main.py $PYSETUP_PATH/main.py
+ENTRYPOINT $PYSETUP_PATH/main.py
diff --git a/.gitpod.yml b/.gitpod.yml
new file mode 100644
index 0000000..8d6f604
--- /dev/null
+++ b/.gitpod.yml
@@ -0,0 +1,5 @@
+image:
+  file: .gitpod.Dockerfile
+
+tasks:
+  - init: poetry install --no-dev
@@ -3,7 +3,7 @@
 
 import argparse
 import logging
-from newspaper import Article, build
+from newspaper import Article, build, Config
 from bs4 import BeautifulSoup
 from contextlib import closing
 from requests import get
@@ -24,7 +24,7 @@ class Argparser(object):
 
 # TODO-maybe actually really do some logging
 def logError(err):
-    print(err)
+    logging.exception(err)
 
 
 def isAGoodResponse(resp):
@@ -66,12 +66,18 @@ def getURLS(source):
     return result
 
 
+def configNews(config):
+    config.fetch_images = False
+    config.keep_article_html = True
+    config.memoize_articles = False
+    config.browser_user_agent = "Chrome/91.0.4464.5"
+
+
 def main():
     argparser = Argparser()
+    config = Config()
+    configNews(config)
     urls = getURLS(argparser.args.source)
-    # import sys
-    # print(urls)
-    # sys.exit(0)
     for url in urls:
         parser = build(url)
         for article in parser.articles:
@@ -79,6 +85,7 @@ def main():
             try:
                 a.download()
                 a.parse()
+                # print(a.html)
                 print(a.text)
             except Exception as e:
                 logging.exception(e)
@@ -4,4 +4,6 @@ set -x
 # sniff --src https://github.com/coinpride/CryptoList --url | ./main.py > out2.txt
 ./main.py --source https://github.com/coinpride/CryptoList > out.txt
+# ./main.py --source https://techurls.com/ > out.txt
 # cat out2.txt| pico2wave --wave=out2.wav
+# ./main.py --source http://blog.terminaldweller.com > out.txt
```