From 5da6bd00837236cf8a5dc5aeeadae5cfed7f2021 Mon Sep 17 00:00:00 2001
From: "Leslie P. Polzer" <polzer@gnu.org>
Date: Fri, 20 Feb 2015 10:49:45 +0100
Subject: [PATCH 1/4] [chirbit] Add new extractor.

---
Notes:
    ChirbitIE handles chirb.it short links: it reads the title from the
    <h2 itemprop="name"> element and the audio id from the player's
    "setFile" call, then builds the MP3 URL on audio.chirbit.com.

    Review changes (line counts and hunk headers are unchanged):
    - escaped the literal dots in the "setFile" regex so that "." no
      longer matches arbitrary characters;
    - dropped the stray trailing semicolon and built the MP3 URL with
      %-formatting.
    The chirbit.py blob id on the "index" line predates these edits.

 youtube_dl/extractor/__init__.py |  1 +
 youtube_dl/extractor/chirbit.py  | 34 ++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)
 create mode 100644 youtube_dl/extractor/chirbit.py

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index f225ac654..de08e69bc 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -63,6 +63,7 @@ from .ccc import CCCIE
 from .ceskatelevize import CeskaTelevizeIE
 from .channel9 import Channel9IE
 from .chilloutzone import ChilloutzoneIE
+from .chirbit import ChirbitIE
 from .cinchcast import CinchcastIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py
new file mode 100644
index 000000000..06a3e1a7a
--- /dev/null
+++ b/youtube_dl/extractor/chirbit.py
@@ -0,0 +1,34 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ChirbitIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://chirb.it/PrIPv5',
+        'md5': '9847b0dad6ac3e074568bf2cfb197de8',
+        'info_dict': {
+            'id': 'PrIPv5',
+            'display_id': 'kukushtv_1423231243',
+            'ext': 'mp3',
+            'title': 'Фасадстрой',
+            'url': 'http://audio.chirbit.com/kukushtv_1423231243.mp3'
+        }
+    }
+
+    def _real_extract(self, url):
+        audio_linkid = self._match_id(url)
+        webpage = self._download_webpage(url, audio_linkid)
+
+        audio_title = self._html_search_regex(r'<h2\s+itemprop="name">(.*?)</h2>', webpage, 'title')
+        audio_id = self._html_search_regex(r'\("setFile",\s+"http://audio\.chirbit\.com/(.*?)\.mp3"\)', webpage, 'audio ID')
+        audio_url = 'http://audio.chirbit.com/%s.mp3' % audio_id
+
+        return {
+            'id': audio_linkid,
+            'display_id': audio_id,
+            'title': audio_title,
+            'url': audio_url
+        }

From 365577f5676d63089cb834855dd4cdce7d0dc8aa Mon Sep 17 00:00:00 2001
From: "Leslie P. Polzer" <polzer@gnu.org>
Date: Fri, 20 Feb 2015 14:48:12 +0100
Subject: [PATCH 2/4] [chirbit] add profile extractor.

---
Notes:
    ChirbitProfileIE turns a chirbit.com profile into a playlist. It
    first fetches page 24599 to learn the profile's oldest entry, then
    walks the pages from 0 upwards collecting audio ids and titles.

    Review changes (line counts and hunk headers are unchanged):
    - escaped the literal dots in _VALID_URL and in both "soundFile"
      regexes;
    - a last page without audio no longer raises IndexError on
      oldest_page_entries[-1];
    - the page loop also stops on a page that lists no audio, so a
      missing oldest entry cannot cause an endless download loop;
    - ids and titles are paired with zip() instead of a manual index
      (titles[i] could raise IndexError), the loop variable no longer
      shadows the builtin "id", and trailing semicolons are gone.
    The chirbit.py blob ids on the "index" line predate these edits.

 youtube_dl/extractor/__init__.py |  2 +-
 youtube_dl/extractor/chirbit.py  | 63 ++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index de08e69bc..94e150826 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -63,7 +63,7 @@ from .ccc import CCCIE
 from .ceskatelevize import CeskaTelevizeIE
 from .channel9 import Channel9IE
 from .chilloutzone import ChilloutzoneIE
-from .chirbit import ChirbitIE
+from .chirbit import ChirbitIE, ChirbitProfileIE
 from .cinchcast import CinchcastIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py
index 06a3e1a7a..47ce94aa0 100644
--- a/youtube_dl/extractor/chirbit.py
+++ b/youtube_dl/extractor/chirbit.py
@@ -1,7 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
+from ..utils import clean_html
 
 
 class ChirbitIE(InfoExtractor):
@@ -32,3 +35,63 @@ class ChirbitIE(InfoExtractor):
             'title': audio_title,
             'url': audio_url
         }
+
+class ChirbitProfileIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://chirbit.com/ScarletBeauty',
+        'playlist_count': 3,
+        'info_dict': {
+            '_type': 'playlist',
+            'title': 'ScarletBeauty',
+            'id': 'ScarletBeauty'
+        }
+    }
+
+    def _real_extract(self, url):
+        profile_id = self._match_id(url)
+
+        # Chirbit has a pretty weird "Last Page" navigation behavior.
+        # We grab the profile's oldest entry to determine when to
+        # stop fetching entries.
+        oldestpage = self._download_webpage(url + '/24599', profile_id)
+        oldest_page_entries = re.findall(
+            r'''soundFile:\s*"http://audio\.chirbit\.com/(.*?)\.mp3"''',
+            oldestpage)
+        oldestentry = clean_html(oldest_page_entries[-1]) if oldest_page_entries else None
+
+        ids = []
+        titles = []
+        n = 0
+        while True:
+            page = self._download_webpage(url + '/' + str(n), profile_id)
+            page_ids = re.findall(
+                r'''soundFile:\s*"http://audio\.chirbit\.com/(.*?)\.mp3"''',
+                page)
+            page_titles = re.findall(
+                r'''<div\s+class="chirbit_title"\s*>(.*?)</div>''',
+                page)
+            ids += page_ids
+            titles += page_titles
+            if not page_ids or oldestentry in page_ids:
+                break
+            n += 1
+
+        # zip() stops at the shorter list, so a chirbit whose title
+        # was not matched cannot raise IndexError here.
+        entries = []
+        for audio_id, audio_title in zip(ids, titles):
+            entries.append({
+                'id': audio_id,
+                'title': audio_title,
+                'url': 'http://audio.chirbit.com/%s.mp3' % audio_id
+            })
+
+        info_dict = {
+            '_type': 'playlist',
+            'id': profile_id,
+            'title': profile_id,
+            'entries': entries
+        }
+
+        return info_dict

From ddc369f073fda4ddd429c2d9a104e561cefd417f Mon Sep 17 00:00:00 2001
From: "Leslie P. Polzer" <polzer@gnu.org>
Date: Mon, 23 Feb 2015 12:00:43 +0100
Subject: [PATCH 3/4] [chirbit] fix profile downloader regex.

---
Notes:
    Anchors the profile _VALID_URL with "/?$" so that it only matches a
    bare profile URL.

    Review change: the removed and added lines carry the escaped
    "chirbit\.com" introduced in patch 2 of this series so that the
    hunk still applies. The blob ids on the "index" line predate it.

 youtube_dl/extractor/chirbit.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py
index 47ce94aa0..443192f43 100644
--- a/youtube_dl/extractor/chirbit.py
+++ b/youtube_dl/extractor/chirbit.py
@@ -37,7 +37,7 @@ class ChirbitIE(InfoExtractor):
         }
 
 class ChirbitProfileIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?P<id>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?P<id>[^/]+)/?$'
     _TEST = {
         'url': 'http://chirbit.com/ScarletBeauty',
         'playlist_count': 3,

From 93b5071f73738d788c878b38a57f2b6efe0da883 Mon Sep 17 00:00:00 2001
From: "Leslie P. Polzer" <polzer@gnu.org>
Date: Mon, 23 Feb 2015 12:11:19 +0100
Subject: [PATCH 4/4] [soundgasm] add profile IE.

---
Notes:
    SoundgasmProfileIE collects every "/u/<profile>/<audio>" link on a
    soundgasm.net profile page and returns them as a playlist of "url"
    entries.

    Review changes (line counts and hunk headers are unchanged): the
    loop variable no longer shadows the builtin "id" and the trailing
    semicolon after the return statement is gone.
    The soundgasm.py post-image blob id predates these edits.

 youtube_dl/extractor/__init__.py  |  5 ++++-
 youtube_dl/extractor/soundgasm.py | 36 +++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 94e150826..cf58f0800 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -425,7 +425,10 @@ from .soundcloud import (
     SoundcloudUserIE,
     SoundcloudPlaylistIE
 )
-from .soundgasm import SoundgasmIE
+from .soundgasm import (
+    SoundgasmIE,
+    SoundgasmProfileIE
+)
 from .southpark import (
     SouthParkIE,
     SouthparkDeIE,
diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py
index a4f8ce6c3..e568ff18c 100644
--- a/youtube_dl/extractor/soundgasm.py
+++ b/youtube_dl/extractor/soundgasm.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..utils import clean_html
 
 
 class SoundgasmIE(InfoExtractor):
@@ -38,3 +39,38 @@ class SoundgasmIE(InfoExtractor):
             'title': audio_title,
             'description': description
         }
+
+class SoundgasmProfileIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[0-9a-zA-Z_\-]+)/?$'
+    _TEST = {
+        'url': 'http://soundgasm.net/u/ytdl',
+        'playlist_count': 1,
+        'info_dict': {
+            '_type': 'playlist',
+            'id': 'ytdl',
+            'title': 'ytdl'
+        }
+    }
+
+    def _real_extract(self, url):
+        profile_id = self._match_id(url)
+        webpage = self._download_webpage(url, profile_id)
+
+        ids = re.findall(r'''<a\s+href=".+?/u/%s/([^/]+)">''' % re.escape(profile_id), webpage)
+        ids = [clean_html(audio_id) for audio_id in ids]
+
+        entries = []
+        for audio_id in ids:
+            entries.append({
+                '_type': 'url',
+                'url': ('http://soundgasm.net/u/%s/%s' % (profile_id, audio_id))
+            })
+
+        info_dict = {
+            '_type': 'playlist',
+            'id': profile_id,
+            'title': profile_id,
+            'entries': entries
+        }
+
+        return info_dict