#!/usr/bin/env python3

# feedme: download RSS/Atom feeds and convert to HTML, epub, Plucker,
# or other formats suitable for offline reading on a handheld device.
#
# Copyright 2009-2023 by Akkana Peck
# and licensed under the GPLv2 or later.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details:
# <http://www.gnu.org/licenses/>.

from __future__ import print_function

ConfigHelp = """Configuration options:

Configuration options most useful in a DEFAULT section,
applicable to all feeds:

ascii
  Convert all pages to plain ASCII. Useful for reading devices like
  Palm that can't display other character sets reliably.

block_nonlocal_images
  If images aren't downloaded, normally the img tag will still point
  to the image on the original website. But that can result in
  unwanted data use: people with limited or metered bandwidth can set
  this to True to remove the original image URL so it can't be
  downloaded when the story is viewed.

dir
  Where to save the collected pages.
  See save_days for how long they will be kept.

formats
  Comma-separated list of output formats.
  Default "none", which will result in HTML output.
  Other options: epub, fb2, plucker.

logfile
  Save output (including debugging) to this log.

verbose
  Print lots of debugging chatter while feeding.

min_width
  The minimum number of characters in an item link.
  Links shorter than this will be padded to this length
  (to make tapping easier). Default 25.

save_days
  How long to retain feeds locally.

order
  The order of the feeds you'd like to see: an ordered list
  (one name per line) of the full names (not filenames) of each feed.
  Feed directories will have 01_, 02_ etc. prepended.
  Feeds not listed in order will be sorted alphabetically
  after the ordered ones.

Configuration options you might want to reset for specific feeds:

continue_on_timeout
  Normally, if one page times out, feedme will assume the site is down.
  On sites that link to content from many different URLs,
  set this to true.

encoding
  Normally feedme will try to guess the encoding from the page.
  But some pages lie, so use this to override that.

levels
  Level 1: only save the RSS page.
  Level 1.5: only read the RSS page, but make story pages from it.
  Level 2: save sub-pages.

nocache
  Don't check whether we've seen an entry before: collect everything.

nonlocal_images
  Normally feedme will ignore images from other domains (usually ads).
  But some sites link to images from all over; set this to true
  in that case.

page_start, page_end
  Regexps that define the part of a page that will be fetched.

skip_images
  Don't save images. Default true.

skip_links
  For sites with levels=1 where you just want a single news feed and
  never want to click on anything (e.g. slashdot), this can eliminate
  distracting links that you might tap on accidentally while scrolling.

skip_pats
  Throw out anything matching these patterns.

url
  The RSS URL for the site.

when
  When to check this site, if not every time.
  May be a weekday, e.g. Sat, or a month date, e.g. 1 to check
  only on the first day of any month.
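
An illustrative config sketch (the feed name, URL and paths here are
made up for the example, not defaults):

    [DEFAULT]
    dir = ~/feeds
    save_days = 7
    formats = epub
    verbose = false

    [Example News]
    url = https://example.com/news/rss.xml
    levels = 2
    when = Sat
    skip_images = true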
""" import time import os, sys import re #import types import shutil import traceback import feedparser import output_fmt import urllib.error import socket import posixpath import unicodedata # sheesh, this is apparently the recommended way to parse RFC 2822 dates: import email.utils as email_utils # FeedMe's module for parsing HTML inside feeds: import pageparser from tee import tee import msglog from cache import FeedmeCache from utils import falls_between, last_time_this_feed, expanduser # Rewriting image URLs to local ones import imagecache # utilities, mostly config-file related: import utils # For parsing sites that don't have RSS, just HTML with a list of links import htmlindex # Allow links in top page content. # Feedparser 6.0 has dropped _HTMLSanitizer.acceptable_elements, # but the documentation says that now it allows a and img by default. # https://pythonhosted.org/feedparser/html-sanitization.html try: feedparser._HTMLSanitizer.acceptable_elements.add('a') feedparser._HTMLSanitizer.acceptable_elements.add('img') except AttributeError: print("Don't know how to whitelist elements in feedparser", feedparser.__version__, file=sys.stderr) from bs4 import BeautifulSoup # For importing helper modules import importlib sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), "helpers")) verbose = False # # Clean up old feed directories # def clean_up(): try: days = int(utils.g_config.get('DEFAULT', 'save_days')) feedsdir = expanduser(utils.g_config.get('DEFAULT', 'dir')) cachedir = FeedmeCache.get_cache_dir() except: print("Error trying to get save_days and feed dir; can't clean up", file=sys.stderr) return now = time.time() def clean_up_dir(dirname, rmdir): '''If rmdir==True, remove (recursively) old directories, ignoring files at the toplevel. Otherwise remove old files at the toplevel. ''' for f in os.listdir(dirname): # Files never to delete: if f in ['feedme.dat', 'feeds.css', 'darkfeeds.css', 'LOG', 'urlrss.log' ]: continue f = os.path.join(dirname, f) # Logical xor: if rmdir is set, or it's not a directory, # but not both, then skip this entry. # ^ is bitwise xor but works if both args are bool. if rmdir ^ os.path.isdir(f): continue try: howold = (now - os.path.getctime(f)) / 60 / 60 / 24 if howold > days: print("Deleting", f, file=sys.stderr) if os.path.isdir(f): shutil.rmtree(f) else: os.unlink(f) except Exception as e: print("Couldn't unlink", f, str(e)) print("Cleaning up files older than %d days from feed and cache dirs" % days) clean_up_dir(feedsdir, True) clean_up_dir(cachedir, False) # # Ctrl-C Interrupt handler: prompt for what to do. # def handle_keyboard_interrupt(msg): # os.isatty() doesn't work, so: if not hasattr(sys.stdin, "isatty"): print("Interrupt, and not running interactively. Exiting.") sys.exit(1) try: response = input(msg) except EOFError: # This happens if we're run from a script rather than interactively # and yet someone sends a SIGINT, perhaps because we're timing out # and someone logged in to kick us back into operation. # In this case, pretend the user typed 'n', # meaning skip to next site. return 'n' if response == '': return '\0' if response[0] == 'q': sys.exit(1) return response[0] def parse_name_from_conf_file(feedfile): """Given the full pathname to a .conf file name, return the site name from the initial [The Site Name] line. 
""" with open(feedfile) as fp: for line in fp: m = re.match('^\b*\[(.*)\]\b*$', line) if m: return m.group(1) return None allow_unicode = False def slugify(value, allow_unicode=False): """ Taken from https://github.com/django/django/blob/master/django/utils/text.py Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated dashes to single underscores. Remove characters that aren't alphanumerics, underscores, or hyphens. Convert to lowercase. Also strip leading and trailing whitespace, dashes, and underscores. """ value = str(value) if allow_unicode: value = unicodedata.normalize('NFKC', value) else: value = unicodedata.normalize('NFKD', value) \ .encode('ascii', 'ignore').decode('ascii') value = re.sub(r'[^\w\s-]', '', value) return re.sub(r'[-\s]+', '_', value).strip('-_') # A number prepended to each feed if the user specifies feed order. g_feednum = 0 # # Get a single feed # def get_feed(feedname, cache, last_time, msglog): """Fetch a single site's feed. feedname can be the feed's config name ("Washington Post") or the conf file name ("washingtonpost" or "washingtonpost.conf"). """ global verbose, g_feednum verbose = (utils.g_config.get("DEFAULT", 'verbose').lower() == 'true') # Mandatory arguments: try: sitefeedurl = utils.g_config.get(feedname, 'url') feedsdir = utils.g_config.get(feedname, 'dir') except Exception as e: sitefeedurl = None # If feedname isn't a name in the config files, maybe it's the name # of a config file itself, e.g. if not "Foo News", # then maybe foonews or foonews.conf. if not sitefeedurl: fakefeedname = None if os.path.exists(feedname): # XXX This clause will accept the full path to a .conf file as # a commandline argument -- but that file will only be # used for the feed name, not for the actual feed parameters # or other config values, which probably isn't what the user # expects. The config object has already been initialized # by this time, and overwriting it is probably more work than # is warranted given that I never actually expect to use # config files from outside the configdir. 


# A number prepended to each feed if the user specifies feed order.
g_feednum = 0


#
# Get a single feed
#
def get_feed(feedname, cache, last_time, msglog):
    """Fetch a single site's feed.
       feedname can be the feed's config name ("Washington Post")
       or the conf file name ("washingtonpost" or "washingtonpost.conf").
    """
    global verbose, g_feednum

    verbose = (utils.g_config.get("DEFAULT", 'verbose').lower() == 'true')

    # Mandatory arguments:
    try:
        sitefeedurl = utils.g_config.get(feedname, 'url')
        feedsdir = utils.g_config.get(feedname, 'dir')
    except Exception as e:
        sitefeedurl = None

    # If feedname isn't a name in the config files, maybe it's the name
    # of a config file itself, e.g. if not "Foo News",
    # then maybe foonews or foonews.conf.
    if not sitefeedurl:
        fakefeedname = None

        if os.path.exists(feedname):
            # XXX This clause will accept the full path to a .conf file as
            # a commandline argument -- but that file will only be
            # used for the feed name, not for the actual feed parameters
            # or other config values, which probably isn't what the user
            # expects. The config object has already been initialized
            # by this time, and overwriting it is probably more work than
            # is warranted given that I never actually expect to use
            # config files from outside the configdir.
            fakefeedname = parse_name_from_conf_file(feedname)
            if fakefeedname:
                msglog.warn("Warning: Using name '%s' from %s,"
                            " but config parameters will actually be parsed"
                            " from files in %s"
                            % (fakefeedname, feedname,
                               utils.g_default_confdir))
        else:
            feedfile = os.path.join(utils.g_default_confdir, feedname)
            if os.path.exists(feedfile):
                fakefeedname = parse_name_from_conf_file(feedfile)
            if not sitefeedurl and not feedfile.endswith(".conf"):
                feedfile += ".conf"
                if os.path.exists(feedfile):
                    fakefeedname = parse_name_from_conf_file(feedfile)

        if fakefeedname:
            try:
                sitefeedurl = utils.g_config.get(fakefeedname, 'url')
                feedsdir = utils.g_config.get(fakefeedname, 'dir')
                feedname = fakefeedname
            except:
                if verbose:
                    print(feedname, "isn't a site feed name either",
                          file=sys.stderr)

    if verbose:
        print("\n=============\nGetting %s feed" % feedname, file=sys.stderr)

    if not sitefeedurl:
        msglog.err("Can't find a config for: " + feedname)
        return

    verbose = (utils.g_config.get(feedname, 'verbose').lower() == 'true')
    levels = float(utils.g_config.get(feedname, 'levels'))

    feedsdir = expanduser(feedsdir)
    todaystr = time.strftime("%m-%d-%a")
    feedsdir = os.path.join(feedsdir, todaystr)

    formats = utils.g_config.get(feedname, 'formats').split(',')
    encoding = utils.g_config.get(feedname, 'encoding')
    ascii = utils.g_config.getboolean(feedname, 'ascii')
    skip_links = utils.g_config.getboolean(feedname, 'skip_links')
    skip_link_pats = utils.g_config.get_multiline(feedname, 'skip_link_pats')
    skip_title_pats = utils.g_config.get_multiline(feedname,
                                                   'skip_title_pats')
    user_agent = utils.g_config.get(feedname, 'user_agent')

    # Is this a feed we should only check occasionally?
    # Does it specify gathering only at certain times?
    # If so, has such a time passed since the last time
    # the cache file was written?
    when = utils.g_config.get(feedname, "when")
    if when and when != '' and last_time:
        if not falls_between(when, last_time, time.localtime()):
            print("Skipping", feedname, "-- not", when, file=sys.stderr)
            return
        print("Yes, it's time to feed:", when, file=sys.stderr)

    # encoding = utils.g_config.get(feedname, 'encoding')

    print("\n============\nfeedname:", feedname, file=sys.stderr)

    # Make it a legal and sane dirname
    feednamedir = slugify(feedname)

    # If the user specified an order, prepend its number
    g_feednum += 1
    try:
        order = utils.g_config.get_multiline('DEFAULT', 'order')
        feednamedir = "%02d_%s" % (g_feednum, feednamedir)
    except:
        pass

    outdir = os.path.join(feedsdir, feednamedir)
    if verbose:
        print("feednamedir:", feednamedir, file=sys.stderr)
        print("outdir:", outdir, file=sys.stderr)

    # Get any helpers for this feed.
    # A feed_helper takes precedence over a page_helper.
    # The helpers subdir has already been added to sys.path,
    # at the end, so if the user has an earlier version
    # it will override a built-in of the same name.
    try:
        feed_helper = utils.g_config.get(feedname, 'feed_helper')
    except:
        feed_helper = None
    try:
        page_helper = utils.g_config.get(feedname, 'page_helper')
    except:
        page_helper = None

    if feed_helper or page_helper:
        # Read all helper args, which start with "helper_".
        # $f will map to the feed's output dir, $d to today's date dir.
        # No tilde expansion will be done.
        # Turn them into a dictionary, e.g.
        # helper_executable_path = ~/firefox-esr
        # helper_log = $d/nyt_selenium.log
        # -> {
        #      "executable_path": "~/firefox-esr",
        #      "log": "/home/username/feeds/10-25-Mon/nyt_selenium.log"
        #    }
        confoptions = utils.g_config.options(feedname)
        helper_args = {}
        for opt in confoptions:
            if opt.startswith("helper_"):
                key = opt[7:]
                if key:
                    helper_args[key] = utils.g_config.get(feedname, opt)
                    if '$f' in helper_args[key] and \
                       r'\$f' not in helper_args[key]:
                        helper_args[key] = helper_args[key].replace("$f",
                                                                    outdir)
                    if '$d' in helper_args[key] and \
                       r'\$d' not in helper_args[key]:
                        helper_args[key] = helper_args[key].replace("$d",
                                                                    feedsdir)
                else:
                    print("Skipping bad key '%s' in %s config file"
                          % (opt, feedname), file=sys.stderr)

        if verbose:
            print(feedname, "helper args:", helper_args, file=sys.stderr)

        if feed_helper:
            if verbose:
                print("Trying to import", feed_helper)
            try:
                helpermod = importlib.import_module(feed_helper)
            except Exception as e:
                traceback.print_exc(file=sys.stderr)
                print("Couldn't import module '%s'" % feed_helper,
                      file=sys.stderr)
            try:
                helpermod.fetch_feed(outdir, helper_args)
                if verbose:
                    print("Fetched feed with %s(%s) to %s"
                          % (feed_helper, helper_args, outdir))
            except Exception as e:
                traceback.print_exc(file=sys.stderr)
                print("Couldn't run helper module '%s'" % feed_helper,
                      file=sys.stderr)

            # Whether the copy helper was successful or not,
            # it's time to return.
            return

        else:    # must be a page_helper
            if verbose:
                print("Trying to import", page_helper)
            try:
                helpermod = importlib.import_module(page_helper)
                if verbose:
                    print("Initializing", page_helper, file=sys.stderr)
                helpermod.initialize(helper_args)
            except Exception as e:
                print("Couldn't import module '%s'" % page_helper,
                      file=sys.stderr)
                traceback.print_exc(file=sys.stderr)
                return
    else:
        helpermod = None
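
    # As a rough sketch (module and file names here are hypothetical),
    # a page_helper module is expected to look something like:
    #
    #     # helpers/example_page_helper.py
    #     def initialize(helper_args):
    #         """Called once per feed with the helper_* config options."""
    #
    #     def fetch_article(url):
    #         """Return the story HTML as a string, or None on failure."""
    #
    # while a feed_helper instead provides fetch_feed(outdir, helper_args)
    # and is expected to write the whole feed under outdir itself.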
(Downloaded by " + \ utils.VersionString + ")\n" html_index_links = utils.g_config.get(feedname, 'html_index_links') # feedparser doesn't understand file:// URLs, so translate those # to a local file: # Ironically, with newer changes to feedparser, now file:// # is the only type it *does* handle reliably, and anything else # we have to fetch for it before it can parse. # if sitefeedurl.startswith('file://'): # sitefeedurl = sitefeedurl[7:] # feedparser.parse() can throw unexplained errors like # "xml.sax._exceptions.SAXException: Read failed (no details available)" # which will kill our whole process, so guard against that. # Sadly, feedparser usually doesn't give any details about what went wrong. socket.setdefaulttimeout(100) try: print("Running: feedparser.parse(%s)" % (sitefeedurl), file=sys.stderr) # Feedparser sometimes makes bogus decisions about charsets # fetched from http servers. # For instance, on http://www.lamonitor.com/todaysnews/rss.xml # some versions of feedparser (actually, some version of some # underlying library it uses, but since feedparser's documentation # is so sketchy and it's so inflexible, it's impossible to tell # exactly where the problem is) will ignore the encoding specified # in the feed and randomly decide to use something else. # For instance, http://www.lamonitor.com/todaysnews/rss.xml # specifies encoding="utf-8", but on Debian Jessie the parsed feed # has 'encoding': u'iso-8859-2'. # (Debian Stretch gets the right answer of utf-8. Even though # the versions of feedfetcher, chardet and urllib2 are identical.) # But if we fetch the RSS explicitly with urllib2 and pass it # as a string to feedfetcher.parse(), it doesn't do this. # feed = feedparser.parse(sitefeedurl) # HOWEVER: # If we do this on file:// URLs it causes an "unknown url type" # error -- no idea why. I just love feedparser so much. :-( if sitefeedurl.startswith("file://"): feed = feedparser.parse(sitefeedurl) elif html_index_links: feed = htmlindex.parse(feedname, html_index_links, verbose=verbose) else: downloader = pageparser.FeedmeURLDownloader(feedname, verbose=verbose) rss_str = downloader.download_url(sitefeedurl) feed = feedparser.parse(rss_str) rss_str = None response = None # except xml.sax._exceptions.SAXException, e: except urllib.error.HTTPError as e: print("HTTP error parsing URL:", sitefeedurl, file=sys.stderr) print(str(e), file=sys.stderr) return except pageparser.CookieError as e: msglog.err("No cookies, skipping site") msglog.err("Error was: %s" % e.message) msglog.err("Cookiefile details: %s\n" % e.longmessage) return except Exception as e: print("Couldn't parse feed: URL:", sitefeedurl, file=sys.stderr) print(str(e), file=sys.stderr) # raise(e) utils.ptraceback() # print(traceback.format_exc()) return # feedparser has no error return! One way is to check len(feed.feed). # Which makes no sense sicne feed is an object, why should it have a length? # if len(feed.feed) == 0: if len(feed.entries) == 0: msglog.err("Can't read " + sitefeedurl) return # XXX Sometimes feeds die a few lines later getting feed.feed.title. # Here's a braindead guard against it -- but why isn't this # whole clause inside a try? It should be. try: title = feed.feed.title except: title = None # if not 'title' in feed.feed: if not title: msglog.msg(sitefeedurl + " lacks a title!") feed.feed.title = '[' + feedname + ']' if cache and not nocache: try: feedcache = cache.thedict[sitefeedurl] except: feedcache = [] newfeedcache = [] # suburls: mapping of URLs we've encountered to local URLs. 
    # Any anchors (#anchor) will be discarded.
    # This is for sites like WorldWideWords that make many links
    # to the same page.
    suburls = []

    # Some sites, like Washington Post, repeat the same story
    # several times but with different URLs (and no ID specified).
    # The only way to tell we've seen them before is by title.
    titles = []

    # indexstr is the contents of the index.html file.
    # Kept as a string until we know whether there are new, non-cached
    # stories so it's worth updating the copy on disk.
    # The stylesheet is for FeedViewer and shouldn't bother plucker etc.
    day = time.strftime("%a")
    # (The HTML tags in this template are an assumed reconstruction:
    #  the original markup was lost, leaving only the text and %s slots.)
    indexstr = """<html>\n<head>\n<title> %s: %s </title>\n<link rel="stylesheet" type="text/css" href="../../feeds.css" />\n</head>\n<body>\n<h1>%s: %s: %s</h1>\n""" % (day, feedname, day, feedname, feed.feed.title)

    if verbose:
        print("********* Reading", sitefeedurl, file=sys.stderr)

    # A pattern to tell the user how to get to the next story: >->
    # We also might want to remove that pattern later, in case
    # a story wasn't successfully downloaded -- so make a
    # regexp that can match it.
    # (The markup here is also an assumed reconstruction; the "#%d"
    #  anchor target is inferred from the next_item_string % (itemnum+1)
    #  use below.)
    next_item_string = '<center>\n<a href="#%d">>-></a>\n</center>\n'
    next_item_pattern = '<center>\n<a href="#[0-9]+">>-></a>\n</center>\n'

    try:
        urlrewrite = utils.g_config.get_multiline(feedname,
                                                  'story_url_rewrite')
        if urlrewrite:
            print("**** urlrewrite:", urlrewrite, file=sys.stderr)
    except:
        urlrewrite = None

    days = int(utils.g_config.get('DEFAULT', 'save_days'))
    too_old = time.time() - days * 60 * 60 * 24
    print("too_old would be", too_old, "(now is", time.time(), ")",
          file=sys.stderr)

    # We'll increment itemnum as soon as we start showing entries,
    # so start it negative so anchor links will start at zero.
    itemnum = -1

    last_page_written = None

    for item in feed.entries:
        try:
            #
            # Get the list of links (href) and a (hopefully) unique ID.
            # Make sure the id is a string; most of these components
            # inside item are bytes.
            # XXX Is href[] ever even used? Is this clause obsolete?
            # Answer: not obsolete, at least A Word A Day uses it.
            # But sometimes this clause will be triggered on a site
            # that doesn't have "links" in its RSS source
            # (e.g. Washington Post), which doesn't have href either.
            #
            if 'links' in item:
                href = [str(i['href']) for i in item.links
                        if 'rel' in i and 'href' in i
                        and i['rel'] == 'alternate']
            else:
                href = []

            if 'id' in item:
                item_id = str(item.id)
                if verbose:
                    print("\nID %s" % item_id, file=sys.stderr)
            elif href:
                item_id = str(href[0])
                if verbose:
                    print("Using URL '%s' for ID" % item_id, file=sys.stderr)
            else:
                if verbose:
                    print("Item in %s had no ID or URL." % feedname,
                          file=sys.stderr)
                continue    # or return?

            # Whatever pattern we're using for the ID, it will need to
            # have spaces mapped to + before putting it in the cache.
            # So do that now.
            item_id = FeedmeCache.id_encode(item_id)

            # Does the link match a pattern we're skipping?
            item_link = str(item.link)
            if skip_link_pats:
                skipping = False
                for spat in skip_link_pats:
                    if re.search(spat, item_link):
                        skipping = True
                        if verbose:
                            print("Skipping", item_link,
                                  "because it matches", spat,
                                  file=sys.stderr)
                        break
                if skipping:
                    continue

            # How about the title? Does that match a skip pattern?
            item_title = str(item.title)
            if skip_title_pats:
                skipping = False
                for pat in skip_title_pats:
                    if re.search(pat, item_title, flags=re.IGNORECASE):
                        skipping = True
                        if verbose:
                            print("Skipping", item_link,
                                  "because of skip_title_pats " + pat,
                                  file=sys.stderr)
                        break
                if skipping:
                    continue

            # Filter out file types known not to work.
            # XXX Only mp3 for now. Obviously, make this more general.
            # Wish we could do this using the server's type rather than
            # file extension!
            if item_link.endswith("mp3"):
                print("Filtering out mp3 link", item_link, file=sys.stderr)
                continue

            # Make sure ids don't have named anchors appended:
            anchor_index = item_id.rfind('#')
            if anchor_index >= 0:
                anchor = item_id[anchor_index:]
                item_id = item_id[0:anchor_index]
            else:
                anchor = ""

            # See if we've already seen this page's ID in this run.
            try:
                pagenum = suburls.index(item_id)
                # We've already seen a link to this URL.
                # That could mean it's a link to a different named anchor
                # within the same file, or it could mean that it's just
                # a duplicate (see below).
                if verbose:
                    print("already seen item id", item_id,
                          "this run: skipping", file=sys.stderr)
                continue
            except ValueError:
                if verbose:
                    print("haven't seen item id", item_id, "yet this run",
                          file=sys.stderr)

            # Is it a duplicate story that we've already seen in this run?
            # Some sites, like Washington Post, repeat the same stories
            # multiple times on their RSS feed, but stories won't be
            # added to our real feedcache until we've succeeded in
            # fetching the whole site. So check the temporary cache.
            # On the other hand, Risks Digest has a single story and
            # a lot of RSS entries with links to #name tags in that story.
            # So in that case we should include the entry but not
            # re-fetch the story.
            if newfeedcache and item_id in newfeedcache:
                if verbose:
                    print("%s repeated today -- skipping" % item_id,
                          file=sys.stderr)
                continue

            # How about the title? Have we already seen that before?
            # Washington Post runs the same story (same title) several
            # times with different URLs.
            # If we ever need to handle a feed where different stories
            # have the same title, this will have to be configurable.
            if item.title in titles:
                print('Skipping repeated title with a new ID: "%s", ID "%s"'
                      % (item.title, item_id), file=sys.stderr)
                continue
            titles.append(item.title)

            # Get the published date.
            # item.pubDate is a unicode string, supposed to be in format
            # Thu, 11 Aug 2016 14:46:50 GMT (or +0000).
            # email.utils.parsedate returns a tuple.
            # Pass it to time.mktime() to get seconds since epoch.
            # XXX feedparser now has published_parsed which is
            # a time.struct_time. Can we count on that and not
            # have to do parsing here?
            try:
                pub_date = time.mktime(email_utils.parsedate(item.published))
            except:
                pub_date = None

            # Haven't seen it yet this run. But is it in the cache already?
            if not nocache:
                # We want it in the cache, whether it's new or not:
                if verbose:
                    print("Will cache as %s" % item_id, file=sys.stderr)
                if item_id not in newfeedcache:
                    newfeedcache.append(item_id)
                if item_id in feedcache:
                    if verbose:
                        print("Seen it before, it's in the cache",
                              file=sys.stderr)
                    # We've seen this ID before. HOWEVER, it may still
                    # be new: a site might have a static URL for the
                    # monthly photo contest that gets updated once
                    # a month with all-new content.
                    if not utils.g_config.getboolean(feedname,
                                                     'allow_repeats'):
                        if verbose:
                            print(item_id, "already cached -- skipping",
                                  file=sys.stderr)
                        continue

                    # Repeats are allowed. So check the pub date.
                    # XXX Unfortunately cache entries don't include
                    # a date, so for now, allow repeat URLs if their
                    # content was updated since the last feedme run.
                    # This will unfortunately miss sites that
                    # aren't checked every day.
                    if verbose:
                        print("Seen this before, but repeats are allowed")
                        print("Last time this feed", last_fed_this)
                        print("pub_date", pub_date)
                    # If either date is unknown, err on the side of
                    # re-fetching rather than comparing against None.
                    if pub_date and last_fed_this \
                       and pub_date <= last_fed_this:
                        print("No new changes, skipping")
                        continue
                    print("Recent change, re-fetching", file=sys.stderr)

                elif verbose:
                    print("'%s' is not in the cache -- fetching" % item_id,
                          file=sys.stderr)

            # We're probably including this item. Add it to suburls.
            suburls.append(item_id)
            pagenum = len(suburls) - 1

            # Sanity check: is the pubDate newer than the last
            # time we ran feedme? A negative answer isn't
            # necessarily a reason not to get it.
            # See if it's newer or older. If it's older,
            # we've probably seen it already; give a warning.
            if pub_date and last_fed_this:
                if verbose:
                    print("Comparing pub_date %s to last_fed_this %s"
                          % (str(pub_date), str(last_fed_this)),
                          file=sys.stderr)
                if pub_date < last_fed_this and (verbose or not nocache):
                    # If an entry is older than the maximum age
                    # for the cache, skip it with a warning.
                    if pub_date <= too_old and not nocache:
                        msglog.warn("%s is so old (%s -> %s) it's expired"
                                    " from the cache -- skipping"
                                    % (item_id, str(item.published),
                                       str(pub_date)))
                        # XXX Remove from suburls?
                        continue
                    # Else warn about it, but include it in the feed.
msglog.warn("%s is older (%s) than the last time we updated this feed (%s)" \ % (item_id, str(item.published), time.strftime("%m-%d-%a-%y", time.gmtime(last_fed_this)))) else: print("%s: last updated %s, pubDate is %s" \ % (item_id, str(item.published), time.strftime("%m-%d-%a-%y", time.gmtime(last_fed_this)))) elif verbose and not pub_date: print(item_id, ": No pub_date!", file=sys.stderr) itemnum += 1 if verbose: print("Item:", item_title, file=sys.stderr) # Now itemnum is the number of the entry on the index page; # pagenum is the html file of the subentry, e.g. 3.html. # Make the directory for this feed if we haven't already if not os.access(outdir, os.W_OK): if verbose: print("Making", outdir, file=sys.stderr) os.makedirs(outdir) if 'author' in item: author = str(item.author) else: author = None content = get_content(item) # A parser is mostly needed for levels > 1, but even with # levels=1 we'll use it at the end for rewriting images # in the index string. parser = pageparser.FeedmeHTMLParser(feedname) # # If it's a normal multi-level site, # follow the link and make a file for it: # if levels > 1: try: # Try to trap keyboard interrupts, + others # For the sub-pages, we're getting HTML, not RSS. # Nobody seems to have RSS pointing to RSS. fnam = str(pagenum) + ".html" # Add a nextitem link in the footer to the next story, # Shouldn't do this for the last story; # but there's no easy way to tell if this is the last story, # because we don't know until we try whether the next # story will actually be fetched or not. footer = '
>-%d->
' \ % (itemnum+1, itemnum+1) # Add the page's URL to the footer: footer += downloaded_string footer += '\n
\n%s' % (item_link, item_link) if helpermod: try: htmlstr = helpermod.fetch_article(item_link) except Exception as e: print("Helper couldn't fetch", item_link, file=sys.stderr) traceback.print_exc(file=sys.stderr) continue if not htmlstr: if verbose: print("fetch failed on", item_link, file=sys.stderr) continue else: if levels == 1.5: # On sites that put the full story in the RSS # entry, we can use that for the story, # no need to fetch another file. htmlstr = content else: htmlstr = None if urlrewrite: if len(urlrewrite) == 2: oldlink = item_link item_link = re.sub(urlrewrite[0], urlrewrite[1], item_link) print("Rewrote", oldlink, "to", item_link, file=sys.stderr) else: print("story_url_rewrite had wrong # args:", len(urlrewrite), urlrewrite, file=sys.stderr) parser.fetch_url(item_link, outdir, fnam, title=item_title, author=author, footer=footer, html=htmlstr, user_agent=user_agent) # On level 1.5 sites, import any changes just made # to the final output. # On level 2 sites, pageparser didn't change the # index page, just sub-pages so this wouldn't help. # XXX should be a better way to get that from # the pageparser, and a way to get pageparser # to clean the index page for levels 1 and 2. if levels == 1.5: justwrotefile = os.path.join(outdir, fnam) if os.path.exists(justwrotefile): with open(justwrotefile) as justwrotefp: content = justwrotefp.read() elif verbose: print("File", justwrotefile, "doesn't exist, can't read back", file=sys.stderr) last_page_written = fnam except pageparser.NoContentError as e: # fetch_url didn't get the page or didn't write a file. # So don't increment pagenum or itemnum for the next story. msglog.warn("Didn't find any content on " + item_link + ": " + str(e)) # It is so annoying needing to repeat these # lines every time! Isn't there a way I can define # a subfunction that knows about this function's # local variables? itemnum -= 1 suburls.remove(item_id) # Include a note in the indexstr indexstr += '

No content for %s\n' \ % (item_link, item_title) continue # Catch timeouts. # If we get a timeout on a story, # we should assume the whole site has gone down, # and skip over the rest of the site. # XXX Though maybe we shouldn't give up til N timeouts. # In Python 2.6, instead of raising socket.timeout # a timeout will raise urllib2.URLerror with # e.reason set to socket.timeout. # XXX Should add a note to the index page about what happened. except socket.timeout as e: errmsg = "Socket.timeout error on title " + item_title errmsg += "\nBreaking -- hopefully we'll write index.html" msglog.err(errmsg) # Include a note in the indexstr indexstr += '

Socket timeout for %s\n' \ % (item_link, item_title) if utils.g_config.get(feedname, 'continue_on_timeout') == 'true': continue break # Handle timeouts in Python 2.6 except urllib.error.URLError as e: if isinstance(e.reason, socket.timeout): errmsg = "URLError Socket.timeout on title " errmsg += item_title errmsg += "\n" indexstr += "

" + errmsg if utils.g_config.get(feedname, 'continue_on_timeout') == 'true': errmsg += "continue_on_timeout is true" msglog.err(errmsg) continue errmsg += "Breaking -- hopefully we'll write index.html" msglog.err(errmsg) indexstr += \ '

Socket timeout for %s\n' \ % (item_link, item_title) break # Some other type of URLError. errmsg = 'URLError on %s
\n%s
\n' \ % (item_link, item_link, str(e)) msglog.err(errmsg) indexstr += "

" + errmsg + "" indexstr += '

URLerror for %s: %s\n' \ % (item_link, item_title, str(e)) continue except KeyboardInterrupt: response = handle_keyboard_interrupt(""" *** Caught keyboard interrupt reading a story! ***\n Options: q: Quit c: Continue trying to read this story s: Skip to next story n: Skip to next site Which (default = s): """) if response[0] == 'n' : # next site # XXX We should write an index.html here # with anything we've gotten so far. # Ideally we'd break out of the # for item in feed.entries : loop. # Wonder if there's a way to do that in python? # Failing that, and hoping it's the only # enclosing loop: print("Breaking -- hopefully we'll write an index.html") break #return elif response[0] != 'c' : # next story (default) continue # If the response was 'c', we continue and just # ignore the interrupt. except (IOError, urllib.error.HTTPError) as e: # Collect info about what went wrong: errmsg = "Couldn't read " + item_link + "\n" #errmsg += "Title: " + item_title if verbose: errmsg += str(e) + '
\n' #errmsg += str(sys.exc_info()[0]) + '
\n' #errmsg += str(sys.exc_info()[1]) + '
\n' # errmsg += traceback.format_exc() if verbose: print("==============", file=sys.stderr) msglog.err("IO or HTTP error: " + errmsg) if verbose: print("==============", file=sys.stderr) itemnum -= 1 suburls.remove(item_id) #raise # so this entry won't get stored or cached continue # Move on to next story except ValueError as e: # urllib2 is supposed to throw a urllib2.URLError for # "unknown url type", but in practice it throws ValueError. # See this e.g. for doubleclick ad links in the latimes # that have no spec, e.g. //ad.doubleclick.net/... # Unfortunately it seems to happen in other cases too, # so there's no way to separate out the urllib2 ones # except by string: str(sys.exc_info()[1]) starts with # "unknown url type:" errmsg = "ValueError on title " + item_title + "\n" # print >>sys.stderr, errmsg # msglog.err will print it, no need to print it again. if str(sys.exc_info()[1]).startswith("unknown url type:"): # Don't show stack trace for unknown URL types, # since it's a known error. errmsg += str(sys.exc_info()[1]) + " - couldn't load\n" msglog.warn(errmsg) else: errmsg += "ValueError on url " + item_link + "\n" ex_type, ex, tb = sys.exc_info() errmsg += traceback.format_exc(tb) msglog.err(errmsg) itemnum -= 1 suburls.remove(item_id) continue except Exception as e: # An unknown error, so report it complete with traceback. errmsg = "Unknown error reading " + item_link + "\n" errmsg += "Title: " + item_title if verbose: errmsg += "\nItem summary was:\n------\n" errmsg += str(item.summary) + "\n------\n" errmsg += str(e) + '
                except Exception as e:
                    # An unknown error, so report it complete with traceback.
                    errmsg = "Unknown error reading " + item_link + "\n"
                    errmsg += "Title: " + item_title
                    if verbose:
                        errmsg += "\nItem summary was:\n------\n"
                        errmsg += str(item.summary) + "\n------\n"
                    errmsg += str(e) + '\n'
                    # errmsg += str(sys.exc_info()[0]) + '\n'
                    # errmsg += str(sys.exc_info()[1]) + '\n'
                    errmsg += traceback.format_exc()

                    if verbose:
                        print("==============", file=sys.stderr)
                    msglog.err("Unknown error: " + errmsg)
                    if verbose:
                        print("==============", file=sys.stderr)

                    # Put a note in the index string
                    indexstr += errmsg

                    # Are we sure we didn't get anything?
                    # Should we decrement itemnum, etc. ?
                    continue    # Move on to next story, ensure we get index

            # Done with if levels > 1 clause

            if not 'published_parsed' in item:
                if 'updated_parsed' in item:
                    item.published_parsed = item.updated_parsed
                else:
                    item.published_parsed = time.gmtime()

            # Plucker named anchors don't work unless preceded by a <br>
            # http://www.mail-archive.com/plucker-list@rubberchicken.org/msg07314.html
            # (The anchor markup below is an assumed reconstruction.)
            indexstr += "<br>\n<a name=\"%d\"> </a> " % itemnum

            # Make sure the link is at least some minimum width.
            # This is for viewers that have special areas defined on the
            # screen, e.g. areas for paging up/down or adjusting brightness.
            minwidth = utils.g_config.getint(feedname, 'min_width')
            if len(item_title) < minwidth:
                item_title += '. ' * (minwidth - len(item_title)) + '__'

            if levels > 1:
                # (Link markup assumed: point the title at the local
                #  story page written above.)
                itemlink = '<a href="' + fnam + '">'
                indexstr += itemlink + item_title + '</a>\n'
            else:
                # For a single-level site, don't put links over each entry.
                if skip_links:
                    itemlink = None
                    indexstr += "<p>\n" + item_title + "\n"
                else:
                    itemlink = '<a href="' + item_link + '">'
                    indexstr += "<p>\n" + itemlink + item_title + "</a>\n"

            # Under the title, add a link to jump to the next entry.
            # If it's the last entry, we'll change it to "[end]" later.
            indexstr += next_item_string % (itemnum+1)

            if not content:
                content = "[No content]"

            # Sites that put too much formatting crap in the RSS:
            if utils.g_config.getboolean(feedname, 'simplify_rss'):
                content = pageparser.simplify_html(content) + " ... "

            # There's an increasing trend to load up RSS pages with images.
            # Try to remove them if skip_images is true,
            # as well as any links that contain only an image.
            if utils.g_config.getboolean(feedname, 'skip_images'):
                # XXX Rewrite to use parser rather than re
                # (The tag text in these patterns is an assumed
                #  reconstruction; the original regexps were lost.)
                content = re.sub('<a [^>]*href=.*> *</a>', '', content)
                content = re.sub('<img[^>]*>', '', content)

            # If fetching images, download/rewrite images in the RSS too.
            # But for levels==1.5, the RSS content has already passed
            # through pageparser once and has already had images rewritten.
            elif levels != 1.5 and content.strip():
                content = imagecache.rewrite_images(content, sitefeedurl,
                                                    outdir, feedname)

            # Try to get rid of embedded links if skip_links is true:
            if skip_links:
                content = re.sub('<a href=.*?>(.*?)</a>', '\\1', content)
            # If we're keeping links, don't keep empty ones:
            else:
                content = re.sub('<a [^>]*href=.*> *</a>', '', content)

            # Skip any text specified in index_skip_content_pats.
            # Some sites (*cough* Pro Publica *cough*) do weird things
            # like putting huge