From 172754131578f6042efa7c47a57c6e8531e3d190 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sun, 13 Dec 2020 20:24:13 +0700
Subject: [PATCH] [extractor/common] Improve JSON-LD interaction statistic extraction (refs #23306)

---
 test/test_InfoExtractor.py     | 50 ++++++++++++++++++++++++++++++++++
 youtube_dl/extractor/common.py | 12 ++++++--
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index 644b3759c..8745f3aac 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -98,6 +98,56 @@ class TestInfoExtractor(unittest.TestCase):
         self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
         self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
 
+    def test_search_json_ld_realworld(self):
+        # https://github.com/ytdl-org/youtube-dl/issues/23306
+        expect_dict(
+            self,
+            self.ie._search_json_ld(r'''''', None),
+            {
+                'title': '1 On 1 With Kleio',
+                'description': 'Kleio Valentien',
+                'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4',
+                'timestamp': 1449347075,
+                'duration': 743.0,
+                'view_count': 1120958,
+                'width': 1920,
+                'height': 1080,
+            })
+
+
     def test_download_json(self):
         uri = encode_data_uri(b'{"foo": "blah"}', 'application/json')
         self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'})
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 460758ab8..79138f346 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1237,8 +1237,16 @@ class InfoExtractor(object):
             'ViewAction': 'view',
         }
 
+        def extract_interaction_type(e):
+            interaction_type = e.get('interactionType')
+            if isinstance(interaction_type, dict):
+                interaction_type = interaction_type.get('@type')
+            return str_or_none(interaction_type)
+
         def extract_interaction_statistic(e):
             interaction_statistic = e.get('interactionStatistic')
+            if isinstance(interaction_statistic, dict):
+                interaction_statistic = [interaction_statistic]
             if not isinstance(interaction_statistic, list):
                 return
             for is_e in interaction_statistic:
@@ -1246,8 +1254,8 @@ class InfoExtractor(object):
                     continue
                 if is_e.get('@type') != 'InteractionCounter':
                     continue
-                interaction_type = is_e.get('interactionType')
-                if not isinstance(interaction_type, compat_str):
+                interaction_type = extract_interaction_type(is_e)
+                if not interaction_type:
                     continue
                 # For interaction count some sites provide string instead of
                 # an integer (as per spec) with non digit characters (e.g. ",")