removed the undocumented HTMLParser.unescape, replaced with _unescapeHTML; fixed a bug in the use of _unescapeHTML (missing _, from d6a9615347)
This commit is contained in:
parent
c6f45d4314
commit
781cc523af
28
youtube-dl
28
youtube-dl
|
@ -308,13 +308,13 @@ def clean_html(html):
|
||||||
# Strip html tags
|
# Strip html tags
|
||||||
html = re.sub('<.*?>', '', html)
|
html = re.sub('<.*?>', '', html)
|
||||||
# Replace html entities
|
# Replace html entities
|
||||||
html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html)
|
html = _unescapeHTML(html)
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
|
||||||
def sanitize_title(utitle):
|
def sanitize_title(utitle):
|
||||||
"""Sanitizes a video title so it could be used as part of a filename."""
|
"""Sanitizes a video title so it could be used as part of a filename."""
|
||||||
utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
|
utitle = _unescapeHTML(utitle)
|
||||||
return utitle.replace(unicode(os.sep), u'%')
|
return utitle.replace(unicode(os.sep), u'%')
|
||||||
|
|
||||||
|
|
||||||
|
@ -371,8 +371,8 @@ def _unescapeHTML(s):
|
||||||
"""
|
"""
|
||||||
assert type(s) == type(u'')
|
assert type(s) == type(u'')
|
||||||
|
|
||||||
htmlParser = HTMLParser.HTMLParser()
|
result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
|
||||||
return htmlParser.unescape(s)
|
return result
|
||||||
|
|
||||||
def _encodeFilename(s):
|
def _encodeFilename(s):
|
||||||
"""
|
"""
|
||||||
|
@ -1324,8 +1324,8 @@ class YoutubeIE(InfoExtractor):
|
||||||
end = start + float(dur)
|
end = start + float(dur)
|
||||||
start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
|
start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
|
||||||
end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
|
end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
|
||||||
caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
|
caption = _unescapeHTML(caption)
|
||||||
caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
|
caption = _unescapeHTML(caption) # double cycle, inentional
|
||||||
srt += str(n) + '\n'
|
srt += str(n) + '\n'
|
||||||
srt += start + ' --> ' + end + '\n'
|
srt += start + ' --> ' + end + '\n'
|
||||||
srt += caption + '\n\n'
|
srt += caption + '\n\n'
|
||||||
|
@ -2143,7 +2143,7 @@ class YahooIE(InfoExtractor):
|
||||||
self._downloader.trouble(u'ERROR: Unable to extract media URL')
|
self._downloader.trouble(u'ERROR: Unable to extract media URL')
|
||||||
return
|
return
|
||||||
video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
|
video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
|
||||||
video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
|
video_url = _unescapeHTML(video_url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Process video information
|
# Process video information
|
||||||
|
@ -3410,11 +3410,11 @@ class EscapistIE(InfoExtractor):
|
||||||
return
|
return
|
||||||
|
|
||||||
descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
|
descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
|
||||||
description = unescapeHTML(descMatch.group(1))
|
description = _unescapeHTML(descMatch.group(1))
|
||||||
imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
|
imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
|
||||||
imgUrl = unescapeHTML(imgMatch.group(1))
|
imgUrl = _unescapeHTML(imgMatch.group(1))
|
||||||
playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
|
playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
|
||||||
playerUrl = unescapeHTML(playerUrlMatch.group(1))
|
playerUrl = _unescapeHTML(playerUrlMatch.group(1))
|
||||||
configUrlMatch = re.search('config=(.*)$', playerUrl)
|
configUrlMatch = re.search('config=(.*)$', playerUrl)
|
||||||
configUrl = urllib2.unquote(configUrlMatch.group(1))
|
configUrl = urllib2.unquote(configUrlMatch.group(1))
|
||||||
|
|
||||||
|
@ -3966,20 +3966,20 @@ class StanfordOpenClassroomIE(InfoExtractor):
|
||||||
|
|
||||||
m = re.search('<h1>([^<]+)</h1>', coursepage)
|
m = re.search('<h1>([^<]+)</h1>', coursepage)
|
||||||
if m:
|
if m:
|
||||||
info['title'] = unescapeHTML(m.group(1))
|
info['title'] = _unescapeHTML(m.group(1))
|
||||||
else:
|
else:
|
||||||
info['title'] = info['id']
|
info['title'] = info['id']
|
||||||
info['stitle'] = _simplify_title(info['title'])
|
info['stitle'] = _simplify_title(info['title'])
|
||||||
|
|
||||||
m = re.search('<description>([^<]+)</description>', coursepage)
|
m = re.search('<description>([^<]+)</description>', coursepage)
|
||||||
if m:
|
if m:
|
||||||
info['description'] = unescapeHTML(m.group(1))
|
info['description'] = _unescapeHTML(m.group(1))
|
||||||
|
|
||||||
links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
|
links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
|
||||||
info['list'] = [
|
info['list'] = [
|
||||||
{
|
{
|
||||||
'type': 'reference',
|
'type': 'reference',
|
||||||
'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
|
'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(vpage),
|
||||||
}
|
}
|
||||||
for vpage in links]
|
for vpage in links]
|
||||||
|
|
||||||
|
@ -4007,7 +4007,7 @@ class StanfordOpenClassroomIE(InfoExtractor):
|
||||||
info['list'] = [
|
info['list'] = [
|
||||||
{
|
{
|
||||||
'type': 'reference',
|
'type': 'reference',
|
||||||
'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
|
'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(cpage),
|
||||||
}
|
}
|
||||||
for cpage in links]
|
for cpage in links]
|
||||||
|
|
||||||
|
|
|
@ -308,13 +308,13 @@ def clean_html(html):
|
||||||
# Strip html tags
|
# Strip html tags
|
||||||
html = re.sub('<.*?>', '', html)
|
html = re.sub('<.*?>', '', html)
|
||||||
# Replace html entities
|
# Replace html entities
|
||||||
html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html)
|
html = _unescapeHTML(html)
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
|
||||||
def sanitize_title(utitle):
|
def sanitize_title(utitle):
|
||||||
"""Sanitizes a video title so it could be used as part of a filename."""
|
"""Sanitizes a video title so it could be used as part of a filename."""
|
||||||
utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
|
utitle = _unescapeHTML(utitle)
|
||||||
return utitle.replace(unicode(os.sep), u'%')
|
return utitle.replace(unicode(os.sep), u'%')
|
||||||
|
|
||||||
|
|
||||||
|
@ -371,8 +371,8 @@ def _unescapeHTML(s):
|
||||||
"""
|
"""
|
||||||
assert type(s) == type(u'')
|
assert type(s) == type(u'')
|
||||||
|
|
||||||
htmlParser = HTMLParser.HTMLParser()
|
result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
|
||||||
return htmlParser.unescape(s)
|
return result
|
||||||
|
|
||||||
def _encodeFilename(s):
|
def _encodeFilename(s):
|
||||||
"""
|
"""
|
||||||
|
@ -1324,8 +1324,8 @@ class YoutubeIE(InfoExtractor):
|
||||||
end = start + float(dur)
|
end = start + float(dur)
|
||||||
start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
|
start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
|
||||||
end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
|
end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
|
||||||
caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
|
caption = _unescapeHTML(caption)
|
||||||
caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
|
caption = _unescapeHTML(caption) # double cycle, inentional
|
||||||
srt += str(n) + '\n'
|
srt += str(n) + '\n'
|
||||||
srt += start + ' --> ' + end + '\n'
|
srt += start + ' --> ' + end + '\n'
|
||||||
srt += caption + '\n\n'
|
srt += caption + '\n\n'
|
||||||
|
@ -2143,7 +2143,7 @@ class YahooIE(InfoExtractor):
|
||||||
self._downloader.trouble(u'ERROR: Unable to extract media URL')
|
self._downloader.trouble(u'ERROR: Unable to extract media URL')
|
||||||
return
|
return
|
||||||
video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
|
video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
|
||||||
video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
|
video_url = _unescapeHTML(video_url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Process video information
|
# Process video information
|
||||||
|
@ -3410,11 +3410,11 @@ class EscapistIE(InfoExtractor):
|
||||||
return
|
return
|
||||||
|
|
||||||
descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
|
descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
|
||||||
description = unescapeHTML(descMatch.group(1))
|
description = _unescapeHTML(descMatch.group(1))
|
||||||
imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
|
imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
|
||||||
imgUrl = unescapeHTML(imgMatch.group(1))
|
imgUrl = _unescapeHTML(imgMatch.group(1))
|
||||||
playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
|
playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
|
||||||
playerUrl = unescapeHTML(playerUrlMatch.group(1))
|
playerUrl = _unescapeHTML(playerUrlMatch.group(1))
|
||||||
configUrlMatch = re.search('config=(.*)$', playerUrl)
|
configUrlMatch = re.search('config=(.*)$', playerUrl)
|
||||||
configUrl = urllib2.unquote(configUrlMatch.group(1))
|
configUrl = urllib2.unquote(configUrlMatch.group(1))
|
||||||
|
|
||||||
|
@ -3966,20 +3966,20 @@ class StanfordOpenClassroomIE(InfoExtractor):
|
||||||
|
|
||||||
m = re.search('<h1>([^<]+)</h1>', coursepage)
|
m = re.search('<h1>([^<]+)</h1>', coursepage)
|
||||||
if m:
|
if m:
|
||||||
info['title'] = unescapeHTML(m.group(1))
|
info['title'] = _unescapeHTML(m.group(1))
|
||||||
else:
|
else:
|
||||||
info['title'] = info['id']
|
info['title'] = info['id']
|
||||||
info['stitle'] = _simplify_title(info['title'])
|
info['stitle'] = _simplify_title(info['title'])
|
||||||
|
|
||||||
m = re.search('<description>([^<]+)</description>', coursepage)
|
m = re.search('<description>([^<]+)</description>', coursepage)
|
||||||
if m:
|
if m:
|
||||||
info['description'] = unescapeHTML(m.group(1))
|
info['description'] = _unescapeHTML(m.group(1))
|
||||||
|
|
||||||
links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
|
links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
|
||||||
info['list'] = [
|
info['list'] = [
|
||||||
{
|
{
|
||||||
'type': 'reference',
|
'type': 'reference',
|
||||||
'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
|
'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(vpage),
|
||||||
}
|
}
|
||||||
for vpage in links]
|
for vpage in links]
|
||||||
|
|
||||||
|
@ -4007,7 +4007,7 @@ class StanfordOpenClassroomIE(InfoExtractor):
|
||||||
info['list'] = [
|
info['list'] = [
|
||||||
{
|
{
|
||||||
'type': 'reference',
|
'type': 'reference',
|
||||||
'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
|
'url': 'http://openclassroom.stanford.edu/MainFolder/' + _unescapeHTML(cpage),
|
||||||
}
|
}
|
||||||
for cpage in links]
|
for cpage in links]
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue