From 5f67821f49a8e3c4573c4d3a8af977ff61dd51f3 Mon Sep 17 00:00:00 2001 From: terminaldweller Date: Thu, 29 Jul 2021 21:28:24 +0430 Subject: added config for newspaper. added gitpod configs. --- .gitpod.Dockerfile | 27 +++++++++++++++++++++++++++ .gitpod.yml | 5 +++++ main.py | 17 ++++++++++++----- run.sh | 2 ++ 4 files changed, 46 insertions(+), 5 deletions(-) create mode 100644 .gitpod.Dockerfile create mode 100644 .gitpod.yml diff --git a/.gitpod.Dockerfile b/.gitpod.Dockerfile new file mode 100644 index 0000000..2f126fa --- /dev/null +++ b/.gitpod.Dockerfile @@ -0,0 +1,27 @@ +FROM gitpod/workspace-full +FROM python:3.8.11-slim as python-base +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_NO_CACHE_DIR=off \ + PIP_DISABLE_PIP_VERSION_CHECK=on \ + PIP_DEFAULT_TIMEOUT=100 \ + POETRY_HOME="/poetry" \ + POETRY_VIRTUALENVS_IN_PROJECT=true \ + POETRY_NO_INTERACTION=1 \ + PYSETUP_PATH="/devourer" \ + VENV_PATH="/devourer/.venv" +ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" + +FROM python-base as builder-base +ENV POETRY_VERSION=1.0.0 +RUN apt update && apt install -y --no-install-recommends curl build-essential +RUN curl -sSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py | python +WORKDIR $PYSETUP_PATH +COPY ./pyproject.toml ./ +RUN poetry install --no-dev + +FROM python-base as production +ENV FASTAPI_ENV=production +COPY --from=builder-base $VENV_PATH $VENV_PATH +COPY ./main.py $PYSETUP_PATH/main.py +ENTRYPOINT $PYSETUP_PATH/main.py diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 0000000..8d6f604 --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,5 @@ +image: + file: .gitpod.Dockerfile + +tasks: + - init: poetry install --no-dev diff --git a/main.py b/main.py index 00ce6d1..e5794e1 100755 --- a/main.py +++ b/main.py @@ -3,7 +3,7 @@ import argparse import logging -from newspaper import Article, build +from newspaper import Article, build, Config from bs4 import BeautifulSoup from contextlib import closing from requests import get @@ -24,7 +24,7 @@ class Argparser(object): # TODO-maybe actually really do some logging def logError(err): - print(err) + logging.exception(err) def isAGoodResponse(resp): @@ -66,12 +66,18 @@ def getURLS(source): return result +def configNews(config): + config.fetch_images = False + config.keep_article_html = True + config.memoize_articles = False + config.browser_user_agent = "Chrome/91.0.4464.5" + + def main(): argparser = Argparser() + config = Config() + configNews(config) urls = getURLS(argparser.args.source) - # import sys - # print(urls) - # sys.exit(0) for url in urls: parser = build(url) for article in parser.articles: @@ -79,6 +85,7 @@ def main(): try: a.download() a.parse() + # print(a.html) print(a.text) except Exception as e: logging.exception(e) diff --git a/run.sh b/run.sh index f3f73d7..710ec7f 100755 --- a/run.sh +++ b/run.sh @@ -4,4 +4,6 @@ set -x # sniff --src https://github.com/coinpride/CryptoList --url | ./main.py > out2.txt ./main.py --source https://github.com/coinpride/CryptoList > out.txt +# ./main.py --source https://techurls.com/ > out.txt # cat out2.txt| pico2wave --wave=out2.wav +# ./main.py --source http://blog.terminaldweller.com > out.txt -- cgit v1.2.3