mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-12-31 20:11:26 +00:00
[ie/youtube] Support comment subthreads (#15419)
* Support newly rolled out comment "subthreads"
* Fix comments extraction: all replies were being missed
* Add a `max-depth` element to the `max_comments` extractor-arg
* Fully remove the deprecated `max_comment_depth` extractor-arg

Closes #15303
Authored by: bashonly
This commit is contained in:
@@ -1859,8 +1859,9 @@ The following extractors use this feature:
|
||||
* `player_js_variant`: The player javascript variant to use for n/sig deciphering. The known variants are: `main`, `tcc`, `tce`, `es5`, `es6`, `tv`, `tv_es6`, `phone`, `tablet`. The default is `main`, and the others are for debugging purposes. You can use `actual` to go with what is prescribed by the site
|
||||
* `player_js_version`: The player javascript version to use for n/sig deciphering, in the format of `signature_timestamp@hash` (e.g. `20348@0004de42`). The default is to use what is prescribed by the site, and can be selected with `actual`
|
||||
* `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
|
||||
* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`
|
||||
* E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
|
||||
* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread,max-depth`. Default is `all,all,all,all,all`
|
||||
* A `max-depth` value of `1` will discard all replies, regardless of the `max-replies` or `max-replies-per-thread` values given
|
||||
* E.g. `all,all,1000,10,2` will get a maximum of 1000 replies total, with up to 10 replies per thread, and only 2 levels of depth (i.e. top-level comments plus their immediate replies). `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
|
||||
* `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8), `missing_pot` (include formats that require a PO Token but are missing one)
|
||||
* `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
|
||||
* `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used
|
||||
|
||||
@@ -1065,7 +1065,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
|
||||
return next_continuation
|
||||
|
||||
return traverse_obj(renderer, (
|
||||
('contents', 'items', 'rows'), ..., 'continuationItemRenderer',
|
||||
('contents', 'items', 'rows', 'subThreads'), ..., 'continuationItemRenderer',
|
||||
('continuationEndpoint', ('button', 'buttonRenderer', 'command')),
|
||||
), get_all=False, expected_type=cls._extract_continuation_ep_data)
|
||||
|
||||
|
||||
@@ -1660,6 +1660,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
'live_status': 'not_live',
|
||||
},
|
||||
'params': {'skip_download': True},
|
||||
}, {
|
||||
# Threaded comments with 4 levels of depth
|
||||
'url': 'https://www.youtube.com/watch?v=f6HNySwZV4c',
|
||||
'info_dict': {
|
||||
'id': 'f6HNySwZV4c',
|
||||
'ext': 'mp4',
|
||||
'title': 'dlptestvideo2',
|
||||
'description': '',
|
||||
'media_type': 'video',
|
||||
'uploader': 'cole-dlp-test-acc',
|
||||
'uploader_id': '@coletdjnz',
|
||||
'uploader_url': 'https://www.youtube.com/@coletdjnz',
|
||||
'channel': 'cole-dlp-test-acc',
|
||||
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
|
||||
'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
|
||||
'view_count': int,
|
||||
'like_count': int,
|
||||
'age_limit': 0,
|
||||
'duration': 5,
|
||||
'thumbnail': 'https://i.ytimg.com/vi/f6HNySwZV4c/maxresdefault.jpg',
|
||||
'categories': ['People & Blogs'],
|
||||
'tags': [],
|
||||
'timestamp': 1709856007,
|
||||
'upload_date': '20240308',
|
||||
'release_timestamp': 1709856007,
|
||||
'release_date': '20240308',
|
||||
'playable_in_embed': True,
|
||||
'availability': 'public',
|
||||
'live_status': 'not_live',
|
||||
'comment_count': 15,
|
||||
},
|
||||
'params': {
|
||||
'skip_download': True,
|
||||
'getcomments': True,
|
||||
},
|
||||
}]
|
||||
_WEBPAGE_TESTS = [{
|
||||
# <object>
|
||||
@@ -2437,6 +2472,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
def extract_thread(contents, entity_payloads):
|
||||
if not parent:
|
||||
tracker['current_page_thread'] = 0
|
||||
|
||||
if max_depth < tracker['current_depth']:
|
||||
return
|
||||
|
||||
for content in contents:
|
||||
if not parent and tracker['total_parent_comments'] >= max_parents:
|
||||
yield
|
||||
@@ -2480,6 +2519,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
'Detected YouTube comments looping. Stopping comment extraction '
|
||||
f'{"for this thread" if parent else ""} as we probably cannot get any more.')
|
||||
yield
|
||||
break # Safeguard for recursive call in subthreads code path below
|
||||
else:
|
||||
tracker['seen_comment_ids'].add(comment['id'])
|
||||
|
||||
@@ -2492,12 +2532,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)
|
||||
|
||||
if comment_replies_renderer:
|
||||
subthreads = traverse_obj(comment_replies_renderer, (
|
||||
'subThreads', lambda _, v: v['commentThreadRenderer']))
|
||||
# Recursively extract from `commentThreadRenderer`s in `subThreads`
|
||||
if subthreads:
|
||||
tracker['current_depth'] += 1
|
||||
for entry in extract_thread(subthreads, entity_payloads):
|
||||
if entry:
|
||||
yield entry
|
||||
tracker['current_depth'] -= 1
|
||||
# All of the subThreads' `continuationItemRenderer`s were within the nested
|
||||
# `commentThreadRenderer`s and are now exhausted, so avoid unnecessary recursion below
|
||||
continue
|
||||
|
||||
tracker['current_page_thread'] += 1
|
||||
tracker['current_depth'] += 1
|
||||
# Recursively extract from `continuationItemRenderer`s in `subThreads`
|
||||
comment_entries_iter = self._comment_entries(
|
||||
comment_replies_renderer, ytcfg, video_id,
|
||||
parent=comment.get('id'), tracker=tracker)
|
||||
parent=comment_id, tracker=tracker)
|
||||
yield from itertools.islice(comment_entries_iter, min(
|
||||
max_replies_per_thread, max(0, max_replies - tracker['total_reply_comments'])))
|
||||
tracker['current_depth'] -= 1
|
||||
|
||||
# Keeps track of counts across recursive calls
|
||||
if not tracker:
|
||||
@@ -2509,19 +2565,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
'total_reply_comments': 0,
|
||||
'seen_comment_ids': set(),
|
||||
'pinned_comment_ids': set(),
|
||||
'current_depth': 1,
|
||||
}
|
||||
|
||||
# TODO: Deprecated
|
||||
# YouTube comments have a max depth of 2
|
||||
max_depth = int_or_none(get_single_config_arg('max_comment_depth'))
|
||||
if max_depth:
|
||||
self._downloader.deprecated_feature('[youtube] max_comment_depth extractor argument is deprecated. '
|
||||
'Set max replies in the max-comments extractor argument instead')
|
||||
if max_depth == 1 and parent:
|
||||
return
|
||||
_max_comments, max_parents, max_replies, max_replies_per_thread, max_depth, *_ = (
|
||||
int_or_none(p, default=sys.maxsize) for p in self._configuration_arg('max_comments') + [''] * 5)
|
||||
|
||||
_max_comments, max_parents, max_replies, max_replies_per_thread, *_ = (
|
||||
int_or_none(p, default=sys.maxsize) for p in self._configuration_arg('max_comments') + [''] * 4)
|
||||
if max_depth < tracker['current_depth']:
|
||||
return
|
||||
|
||||
continuation = self._extract_continuation(root_continuation_data)
|
||||
|
||||
@@ -2550,6 +2601,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
note_prefix = ' Downloading comment API JSON reply thread %d %s' % (
|
||||
tracker['current_page_thread'], comment_prog_str)
|
||||
else:
|
||||
# TODO: `parent` is only truthy in this code path with YT's legacy (non-threaded) comment view
|
||||
note_prefix = '{}Downloading comment{} API JSON page {} {}'.format(
|
||||
' ' if parent else '', ' replies' if parent else '',
|
||||
page_num, comment_prog_str)
|
||||
@@ -2566,6 +2618,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||
ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
|
||||
check_get_keys=check_get_keys)
|
||||
except ExtractorError as e:
|
||||
# TODO: This code path is not reached since eb5bdbfa70126c7d5355cc0954b63720522e462c
|
||||
# Ignore incomplete data error for replies if retries didn't work.
|
||||
# This is to allow any other parent comments and comment threads to be downloaded.
|
||||
# See: https://github.com/yt-dlp/yt-dlp/issues/4669
|
||||
|
||||
Reference in New Issue
Block a user