sharkcz / rpms / calibre

Forked from rpms/calibre 4 years ago
Clone
Blob Blame History Raw
From 252fa24e4b6bb1f8503b134fd871f3cd29c98882 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Thu, 8 Aug 2019 10:05:59 +0530
Subject: [PATCH 43/71] Update National Geographic

---
 recipes/natgeo.recipe | 65 +++++++++++++++++++++++++++++++------------
 1 file changed, 47 insertions(+), 18 deletions(-)

diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe
index af9b4e4c9f..84061bc1e2 100644
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@@ -3,6 +3,8 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import json
+from collections import defaultdict
+
 from calibre.ebooks.BeautifulSoup import Tag
 from calibre.web.feeds.news import BasicNewsRecipe
 
@@ -20,17 +22,35 @@ def new_tag(soup, name, attrs=()):
     return Tag(soup, name, attrs=attrs or None)
 
 
+def entry_to_article(entry):
+    url = entry.get('uri')
+    if not url:
+        return None, None
+    section = 'Articles'
+    article = {'url': url}
+    for component in entry.get('components', ()):
+        if component.get('content_type') == 'title':
+            article['title'] = component['title']['text']
+        elif component.get('content_type') == 'kicker':
+            v = component['kicker'].get('vertical') or {}
+            if v.get('name'):
+                section = v['name']
+        elif component.get('content_type') == 'dek':
+            if component['dek'].get('text'):
+                article['description'] = component['dek']['text']
+    if 'title' in article:
+        return article, section
+    return None, None
+
+
 class NatGeo(BasicNewsRecipe):
     title = u'National Geographic'
     description = 'Daily news articles from The National Geographic'
     language = 'en'
-    oldest_article = 20
-    max_articles_per_feed = 25
     encoding = 'utf8'
     publisher = 'nationalgeographic.com'
     category = 'science, nat geo'
     __author__ = 'Kovid Goyal'
-    masthead_url = 'http://s.ngeo.com/wpf/sites/themes/global/i/presentation/ng_logo_small.png'
     description = 'Inspiring people to care about the planet since 1888'
     timefmt = ' [%a, %d %b, %Y]'
     no_stylesheets = True
@@ -39,25 +59,34 @@ class NatGeo(BasicNewsRecipe):
     remove_javascript = False
 
     keep_only_tags = [
-            classes('mainArt byline'),
-            dict(id='article__body'),
+        classes('main-title article__dek byline-component publishDate mainArt byline'),
+        dict(id='article__body'),
     ]
     remove_tags = [
-            classes('hide-from-mobile ad-holder enlarge-button'),
-            dict(name='svg meta'.split()),
+        classes('hide-from-mobile ad-holder enlarge-button'),
+        dict(name='svg meta'.split()),
     ]
 
-    feeds = [
-        (u'Daily News', u'http://feeds.nationalgeographic.com/ng/News/News_Main')
-    ]
-
-    def parse_feeds(self):
-        feeds = BasicNewsRecipe.parse_feeds(self)
-        for feed in feeds:
-            for article in feed.articles[:]:
-                if 'Presented' in article.title or 'Pictures' in article.title:
-                    feed.articles.remove(article)
-        return feeds
+    def parse_index(self):
+        feeds = defaultdict(list)
+        br = self.get_browser()
+        # br.open('https://www.nationalgeographic.com/latest-stories/').read()
+        res = br.open_novisit(
+            'https://www.nationalgeographic.com/latest-stories/_jcr_content/content/hubfeed.promo-hub-feed-all-stories.json?offset=0&max=18')
+        entries = json.loads(res.read())
+        for entry in entries:
+            art, section = entry_to_article(entry)
+            if art is None:
+                continue
+            feeds[section].append(art)
+        ans = [(sec, feeds[sec]) for sec in sorted(feeds) if feeds[sec]]
+        for (sec, articles) in ans:
+            self.log('Found section:', sec)
+            for art in articles:
+                self.log('\t', art['title'], art['url'])
+                if 'description' in art:
+                    self.log('\t\t', art['description'])
+        return ans
 
     def preprocess_html(self, soup):
         for div in soup.findAll(attrs={'data-pestle-module': 'PictureFill'}):