aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author terminaldweller <thabogre@gmail.com> 2021-07-29 16:58:24 +0000
committer terminaldweller <thabogre@gmail.com> 2021-07-29 16:58:24 +0000
commit 5f67821f49a8e3c4573c4d3a8af977ff61dd51f3 (patch)
tree 8205ae4a8e527d0d0e34a0821471f106ab7fc4a6
parent no need for externally getting the links. devourer can do that on its own now... (diff)
download devourer-5f67821f49a8e3c4573c4d3a8af977ff61dd51f3.tar.gz
devourer-5f67821f49a8e3c4573c4d3a8af977ff61dd51f3.zip
added config for newspaper. added gitpod configs.
-rw-r--r-- .gitpod.Dockerfile 27
-rw-r--r-- .gitpod.yml 5
-rwxr-xr-x main.py 17
-rwxr-xr-x run.sh 2
4 files changed, 46 insertions, 5 deletions
diff --git a/.gitpod.Dockerfile b/.gitpod.Dockerfile
new file mode 100644
index 0000000..2f126fa
--- /dev/null
+++ b/.gitpod.Dockerfile
@@ -0,0 +1,27 @@
+# Multi-stage build: builder-base resolves dependencies with Poetry; production ships only the virtualenv and main.py. The former leading "FROM gitpod/workspace-full" stage was removed: nothing was copied from it and it was not the final stage.
+FROM python:3.8.11-slim AS python-base
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=on \
+    PIP_DEFAULT_TIMEOUT=100 \
+    POETRY_HOME="/poetry" \
+    POETRY_VIRTUALENVS_IN_PROJECT=true \
+    POETRY_NO_INTERACTION=1 \
+    PYSETUP_PATH="/devourer" \
+    VENV_PATH="/devourer/.venv"
+ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH"
+# builder-base: build tools and Poetry live only here. The installer runs under bash with pipefail so a failed download fails the build. NOTE(review): the installer URL tracks an unpinned master branch - confirm it is still served.
+FROM python-base AS builder-base
+ENV POETRY_VERSION=1.0.0
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential curl && rm -rf /var/lib/apt/lists/*
+RUN ["/bin/bash", "-o", "pipefail", "-c", "curl -fsSL https://raw.githubusercontent.com/sdispater/poetry/master/get-poetry.py | python"]
+WORKDIR $PYSETUP_PATH
+COPY ./pyproject.toml ./
+RUN poetry install --no-dev --no-root
+# production: --no-root above because only pyproject.toml is copied into the builder (no project source to install). Exec-form ENTRYPOINT so "docker run <image> --source URL" arguments reach main.py; the path is spelled out because exec form does not expand $PYSETUP_PATH.
+FROM python-base AS production
+ENV FASTAPI_ENV=production
+COPY --from=builder-base $VENV_PATH $VENV_PATH
+COPY ./main.py $PYSETUP_PATH/main.py
+ENTRYPOINT ["python", "/devourer/main.py"]
diff --git a/.gitpod.yml b/.gitpod.yml
new file mode 100644
index 0000000..8d6f604
--- /dev/null
+++ b/.gitpod.yml
@@ -0,0 +1,5 @@
+image:
+ file: .gitpod.Dockerfile
+
+tasks:
+ - init: poetry install --no-dev
diff --git a/main.py b/main.py
index 00ce6d1..e5794e1 100755
--- a/main.py
+++ b/main.py
@@ -3,7 +3,7 @@
import argparse
import logging
-from newspaper import Article, build
+from newspaper import Article, build, Config
from bs4 import BeautifulSoup
from contextlib import closing
from requests import get
@@ -24,7 +24,7 @@ class Argparser(object):
# TODO-maybe actually really do some logging
def logError(err):
- print(err)
+ logging.exception(err)
def isAGoodResponse(resp):
@@ -66,12 +66,18 @@ def getURLS(source):
return result
+def configNews(config):
+ config.fetch_images = False
+ config.keep_article_html = True
+ config.memoize_articles = False
+ config.browser_user_agent = "Chrome/91.0.4464.5"
+
+
def main():
argparser = Argparser()
+ config = Config()
+ configNews(config)
urls = getURLS(argparser.args.source)
- # import sys
- # print(urls)
- # sys.exit(0)
for url in urls:
parser = build(url)
for article in parser.articles:
@@ -79,6 +85,7 @@ def main():
try:
a.download()
a.parse()
+ # print(a.html)
print(a.text)
except Exception as e:
logging.exception(e)
diff --git a/run.sh b/run.sh
index f3f73d7..710ec7f 100755
--- a/run.sh
+++ b/run.sh
@@ -4,4 +4,6 @@ set -x
# sniff --src https://github.com/coinpride/CryptoList --url | ./main.py > out2.txt
./main.py --source https://github.com/coinpride/CryptoList > out.txt
+# ./main.py --source https://techurls.com/ > out.txt
# cat out2.txt| pico2wave --wave=out2.wav
+# ./main.py --source http://blog.terminaldweller.com > out.txt