sharkcz / rpms / calibre

Forked from rpms/calibre 4 years ago
Clone
Blob Blame History Raw
From 39c86f23d401f9d7329d94fcbf32b51cbc003b8c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 20 Jul 2019 12:40:26 +0530
Subject: [PATCH 03/71] Update WSJ

Fixes #1837213 [Private bug](https://bugs.launchpad.net/calibre/+bug/1837213)
---
 recipes/wsj.recipe      | 98 +++++++++++++++++++++++------------------
 recipes/wsj_free.recipe | 98 +++++++++++++++++++++++------------------
 2 files changed, 110 insertions(+), 86 deletions(-)

diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe
index da28f081b3..f40f3fedfe 100644
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@@ -5,10 +5,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import json
-try:
-    from urllib.parse import quote
-except ImportError:
-    from urllib import quote
+from base64 import standard_b64encode
 
 from mechanize import Request
 
@@ -16,6 +13,16 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 from css_selectors import Select
 
+try:
+    import urllib.parse as urlparse
+except ImportError:
+    import urlparse
+try:
+    from urllib.parse import quote
+except ImportError:
+    from urllib import quote
+
+
 needs_subscription = True
 
 
@@ -40,7 +47,7 @@ class WSJ(BasicNewsRecipe):
     ignore_duplicate_articles = {'url'}
     remove_attributes = ['style', 'data-scrim']
     needs_subscription = needs_subscription
-    WSJ_ITP = 'https://online.wsj.com/itp/today'
+    WSJ_ITP = 'https://www.wsj.com/print-edition/today'
 
     keep_only_tags = [
         dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
@@ -87,51 +94,56 @@ def get_cover_url(self):
     # login {{{
     if needs_subscription:
         def get_browser(self, *a, **kw):
-            # To understand the signin logic read signin.js from
-            # https://id.wsj.com/access/pages/wsj/us/signin.html
-            # This is the same login servie as used by Barrons
+            # To understand the login logic read app-min.js from
+            # https://sso.accounts.dowjones.com/login
+            itp = quote(self.WSJ_ITP, safe='')
+            start_url = 'https://accounts.wsj.com/login?target=' + itp
             kw['user_agent'] = random_user_agent(allow_ie=False)
             br = BasicNewsRecipe.get_browser(self, *a, **kw)
-            # self.wsj_itp_page = open('/t/raw.html').read()
-            # return br
-            url = 'https://id.wsj.com/access/pages/wsj/us/signin.html?mg=com-wsj&mg=id-wsj'
-            # br.set_debug_http(True)
-            br.open(url).read()
-            rurl = 'https://id.wsj.com/auth/submitlogin.json'
-            rq = Request(rurl, headers={
-                'Accept': 'application/json, text/javascript, */*; q=0.01',
+            self.log('Starting login process...')
+            res = br.open(start_url)
+            sso_url = res.geturl()
+            query =  urlparse.parse_qs(urlparse.urlparse(sso_url).query)
+            query = {k:v[0] for k, v in query.items()}
+            request_query = {
+                'username': self.username,
+                'password': self.password,
+                'client_id': query['client'],
+                'sso': 'true',
+                'tenant': 'sso',
+                '_intstate': 'deprecated',
+            }
+            for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split():
+                request_query[k] = query[k]
+            login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
+            # you can get the version below from lib-min.js
+            # search for: str: "x.x.x"
+            # This might need to be updated in the future
+            auth0_client = json.dumps({"name": "auth0.js", "version": "7.0.3"})
+            if not isinstance(auth0_client, bytes):
+                auth0_client = auth0_client.encode('utf-8')
+            auth0_client = standard_b64encode(auth0_client)
+            if isinstance(auth0_client, bytes):
+                auth0_client = auth0_client.decode('ascii')
+            rq = Request(login_url, headers={
+                'Accept': 'text/html',
                 'Accept-Language': 'en-US,en;q=0.8',
-                'Content-Type': 'application/json',
-                'Referer': url,
+                'Auth0-Client': auth0_client.rstrip('='),
                 'X-HTTP-Method-Override': 'POST',
                 'X-Requested-With': 'XMLHttpRequest',
-            }, data=json.dumps({
-                'username': self.username,
-                'password': self.password,
-                'realm': 'default',
-                'savelogin': 'true',
-                'template': 'default',
-                'url': quote(self.WSJ_ITP),
-            }))
-            r = br.open(rq)
-            if r.code != 200:
-                raise ValueError('Failed to login, check username and password')
-            data = json.loads(r.read())
-            # print(data)
-            if data.get('result') != 'success':
-                raise ValueError(
-                    'Failed to login (XHR failed), check username and password')
-            br.set_cookie('m', data['username'], '.wsj.com')
-            try:
-                r = br.open(data['url'])
-            except Exception:
-                self.log.error('Failed to open login url: {}'.format(data['url']))
-                raise
-            self.wsj_itp_page = raw = r.read()
+                'X-Remote-User': self.username
+            }, data=request_query)
+            self.log('Sending login request...')
+            res = br.open(rq)
+            if res.code != 200:
+                raise ValueError('Failed to login, check your username and password')
+            br.select_form(nr=0)
+            self.log('Performing login callback...')
+            res = br.submit()
+            self.wsj_itp_page = raw = res.read()
             if b'>Sign Out<' not in raw:
                 raise ValueError(
-                    'Failed to login (auth URL failed), check username and password')
-            # open('/t/raw.html', 'w').write(raw)
+                    'Failed to login (callback URL failed), check username and password')
             return br
     else:
         def get_browser(self, *a, **kw):
diff --git a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe
index e04e210114..25726c0ca3 100644
--- a/recipes/wsj_free.recipe
+++ b/recipes/wsj_free.recipe
@@ -5,10 +5,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import json
-try:
-    from urllib.parse import quote
-except ImportError:
-    from urllib import quote
+from base64 import standard_b64encode
 
 from mechanize import Request
 
@@ -16,6 +13,16 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 from css_selectors import Select
 
+try:
+    import urllib.parse as urlparse
+except ImportError:
+    import urlparse
+try:
+    from urllib.parse import quote
+except ImportError:
+    from urllib import quote
+
+
 needs_subscription = False
 
 
@@ -40,7 +47,7 @@ class WSJ(BasicNewsRecipe):
     ignore_duplicate_articles = {'url'}
     remove_attributes = ['style', 'data-scrim']
     needs_subscription = needs_subscription
-    WSJ_ITP = 'https://online.wsj.com/itp/today'
+    WSJ_ITP = 'https://www.wsj.com/print-edition/today'
 
     keep_only_tags = [
         dict(classes('wsj-article-headline-wrap article_header bigTop__hed bigTop__dek bigTop__captioncredit')),
@@ -87,51 +94,56 @@ def get_cover_url(self):
     # login {{{
     if needs_subscription:
         def get_browser(self, *a, **kw):
-            # To understand the signin logic read signin.js from
-            # https://id.wsj.com/access/pages/wsj/us/signin.html
-            # This is the same login servie as used by Barrons
+            # To understand the login logic read app-min.js from
+            # https://sso.accounts.dowjones.com/login
+            itp = quote(self.WSJ_ITP, safe='')
+            start_url = 'https://accounts.wsj.com/login?target=' + itp
             kw['user_agent'] = random_user_agent(allow_ie=False)
             br = BasicNewsRecipe.get_browser(self, *a, **kw)
-            # self.wsj_itp_page = open('/t/raw.html').read()
-            # return br
-            url = 'https://id.wsj.com/access/pages/wsj/us/signin.html?mg=com-wsj&mg=id-wsj'
-            # br.set_debug_http(True)
-            br.open(url).read()
-            rurl = 'https://id.wsj.com/auth/submitlogin.json'
-            rq = Request(rurl, headers={
-                'Accept': 'application/json, text/javascript, */*; q=0.01',
+            self.log('Starting login process...')
+            res = br.open(start_url)
+            sso_url = res.geturl()
+            query =  urlparse.parse_qs(urlparse.urlparse(sso_url).query)
+            query = {k:v[0] for k, v in query.items()}
+            request_query = {
+                'username': self.username,
+                'password': self.password,
+                'client_id': query['client'],
+                'sso': 'true',
+                'tenant': 'sso',
+                '_intstate': 'deprecated',
+            }
+            for k in 'scope connection nonce state ui_locales ns protocol redirect_uri'.split():
+                request_query[k] = query[k]
+            login_url = 'https://sso.accounts.dowjones.com/usernamepassword/login'
+            # you can get the version below from lib-min.js
+            # search for: str: "x.x.x"
+            # This might need to be updated in the future
+            auth0_client = json.dumps({"name": "auth0.js", "version": "7.0.3"})
+            if not isinstance(auth0_client, bytes):
+                auth0_client = auth0_client.encode('utf-8')
+            auth0_client = standard_b64encode(auth0_client)
+            if isinstance(auth0_client, bytes):
+                auth0_client = auth0_client.decode('ascii')
+            rq = Request(login_url, headers={
+                'Accept': 'text/html',
                 'Accept-Language': 'en-US,en;q=0.8',
-                'Content-Type': 'application/json',
-                'Referer': url,
+                'Auth0-Client': auth0_client.rstrip('='),
                 'X-HTTP-Method-Override': 'POST',
                 'X-Requested-With': 'XMLHttpRequest',
-            }, data=json.dumps({
-                'username': self.username,
-                'password': self.password,
-                'realm': 'default',
-                'savelogin': 'true',
-                'template': 'default',
-                'url': quote(self.WSJ_ITP),
-            }))
-            r = br.open(rq)
-            if r.code != 200:
-                raise ValueError('Failed to login, check username and password')
-            data = json.loads(r.read())
-            # print(data)
-            if data.get('result') != 'success':
-                raise ValueError(
-                    'Failed to login (XHR failed), check username and password')
-            br.set_cookie('m', data['username'], '.wsj.com')
-            try:
-                r = br.open(data['url'])
-            except Exception:
-                self.log.error('Failed to open login url: {}'.format(data['url']))
-                raise
-            self.wsj_itp_page = raw = r.read()
+                'X-Remote-User': self.username
+            }, data=request_query)
+            self.log('Sending login request...')
+            res = br.open(rq)
+            if res.code != 200:
+                raise ValueError('Failed to login, check your username and password')
+            br.select_form(nr=0)
+            self.log('Performing login callback...')
+            res = br.submit()
+            self.wsj_itp_page = raw = res.read()
             if b'>Sign Out<' not in raw:
                 raise ValueError(
-                    'Failed to login (auth URL failed), check username and password')
-            # open('/t/raw.html', 'w').write(raw)
+                    'Failed to login (callback URL failed), check username and password')
             return br
     else:
         def get_browser(self, *a, **kw):