diff options
author | terminaldweller <thabogre@gmail.com> | 2021-07-29 16:58:24 +0000 |
---|---|---|
committer | terminaldweller <thabogre@gmail.com> | 2021-07-29 16:58:24 +0000 |
commit | 5f67821f49a8e3c4573c4d3a8af977ff61dd51f3 (patch) | |
tree | 8205ae4a8e527d0d0e34a0821471f106ab7fc4a6 /main.py | |
parent | no need for externally getting the links. devourer can do that on its own now... (diff) | |
download | devourer-5f67821f49a8e3c4573c4d3a8af977ff61dd51f3.tar.gz devourer-5f67821f49a8e3c4573c4d3a8af977ff61dd51f3.zip |
added config for newspaper. added gitpod configs.
Diffstat (limited to '')
-rwxr-xr-x | main.py | 17 |
1 files changed, 12 insertions, 5 deletions
@@ -3,7 +3,7 @@
 
 import argparse
 import logging
-from newspaper import Article, build
+from newspaper import Article, build, Config
 from bs4 import BeautifulSoup
 from contextlib import closing
 from requests import get
@@ -24,7 +24,7 @@ class Argparser(object):
 
 # TODO-maybe actually really do some logging
 def logError(err):
-    print(err)
+    logging.exception(err)
 
 
 def isAGoodResponse(resp):
@@ -66,12 +66,18 @@ def getURLS(source):
     return result
 
 
+def configNews(config):
+    config.fetch_images = False
+    config.keep_article_html = True
+    config.memoize_articles = False
+    config.browser_user_agent = "Chrome/91.0.4464.5"
+
+
 def main():
     argparser = Argparser()
+    config = Config()
+    configNews(config)
     urls = getURLS(argparser.args.source)
-    # import sys
-    # print(urls)
-    # sys.exit(0)
     for url in urls:
         parser = build(url)
         for article in parser.articles:
@@ -79,6 +85,7 @@ def main():
             try:
                 a.download()
                 a.parse()
+                # print(a.html)
                 print(a.text)
             except Exception as e:
                 logging.exception(e)