[ie/youtube] Support comment subthreads (#15419)

* Support newly rolled out comment "subthreads"
* Fix comments extraction: all replies were being missed
* Add a `max-depth` element to the `max_comments` extractor-arg
* Fully remove the deprecated `max_comment_depth` extractor-arg

Closes #15303
Authored by: bashonly
2026-02-14 20:46:13 +00:00 · 2025-12-29 15:46:29 -06:00
parent abf29e3e72
commit d22436e5dc
3 changed files with 68 additions and 14 deletions
--- a/README.md
+++ b/README.md
@@ -1859,8 +1859,9 @@ The following extractors use this feature:
 * `player_js_variant`: The player javascript variant to use for n/sig deciphering. The known variants are: `main`, `tcc`, `tce`, `es5`, `es6`, `tv`, `tv_es6`, `phone`, `tablet`. The default is `main`, and the others are for debugging purposes. You can use `actual` to go with what is prescribed by the site
 * `player_js_version`: The player javascript version to use for n/sig deciphering, in the format of `signature_timestamp@hash` (e.g. `20348@0004de42`). The default is to use what is prescribed by the site, and can be selected with `actual`
 * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
-* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`
-    * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
+* `max_comments`: Limit the number of comments to gather. Comma-separated list of integers (or `all` for no limit) representing `max-comments,max-parents,max-replies,max-replies-per-thread,max-depth`. Default is `all,all,all,all,all`
+    * A `max-depth` value of `1` will discard all replies, regardless of the `max-replies` or `max-replies-per-thread` values given
+    * E.g. `all,all,1000,10,2` will get a maximum of 1000 replies total, with up to 10 replies per thread, and only 2 levels of depth (i.e. top-level comments plus their immediate replies). `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
 * `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8), `missing_pot` (include formats that require a PO Token but are missing one)
 * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
 * `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used
--- a/yt_dlp/extractor/youtube/_base.py
+++ b/yt_dlp/extractor/youtube/_base.py
@@ -1065,7 +1065,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
            return next_continuation

        return traverse_obj(renderer, (
-            ('contents', 'items', 'rows'), ..., 'continuationItemRenderer',
+            ('contents', 'items', 'rows', 'subThreads'), ..., 'continuationItemRenderer',
            ('continuationEndpoint', ('button', 'buttonRenderer', 'command')),
        ), get_all=False, expected_type=cls._extract_continuation_ep_data)

--- a/yt_dlp/extractor/youtube/_video.py
+++ b/yt_dlp/extractor/youtube/_video.py
@@ -1660,6 +1660,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            'live_status': 'not_live',
        },
        'params': {'skip_download': True},
+    }, {
+        # Threaded comments with 4 levels of depth
+        'url': 'https://www.youtube.com/watch?v=f6HNySwZV4c',
+        'info_dict': {
+            'id': 'f6HNySwZV4c',
+            'ext': 'mp4',
+            'title': 'dlptestvideo2',
+            'description': '',
+            'media_type': 'video',
+            'uploader': 'cole-dlp-test-acc',
+            'uploader_id': '@coletdjnz',
+            'uploader_url': 'https://www.youtube.com/@coletdjnz',
+            'channel': 'cole-dlp-test-acc',
+            'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
+            'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
+            'view_count': int,
+            'like_count': int,
+            'age_limit': 0,
+            'duration': 5,
+            'thumbnail': 'https://i.ytimg.com/vi/f6HNySwZV4c/maxresdefault.jpg',
+            'categories': ['People & Blogs'],
+            'tags': [],
+            'timestamp': 1709856007,
+            'upload_date': '20240308',
+            'release_timestamp': 1709856007,
+            'release_date': '20240308',
+            'playable_in_embed': True,
+            'availability': 'public',
+            'live_status': 'not_live',
+            'comment_count': 15,
+        },
+        'params': {
+            'skip_download': True,
+            'getcomments': True,
+        },
    }]
    _WEBPAGE_TESTS = [{
        # <object>
@@ -2437,6 +2472,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        def extract_thread(contents, entity_payloads):
            if not parent:
                tracker['current_page_thread'] = 0
+
+            if max_depth < tracker['current_depth']:
+                return
+
            for content in contents:
                if not parent and tracker['total_parent_comments'] >= max_parents:
                    yield
@@ -2480,6 +2519,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                        'Detected YouTube comments looping. Stopping comment extraction '
                        f'{"for this thread" if parent else ""} as we probably cannot get any more.')
                    yield
+                    break  # Safeguard for recursive call in subthreads code path below
                else:
                    tracker['seen_comment_ids'].add(comment['id'])

@@ -2492,12 +2532,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)

                if comment_replies_renderer:
+                    subthreads = traverse_obj(comment_replies_renderer, (
+                        'subThreads', lambda _, v: v['commentThreadRenderer']))
+                    # Recursively extract from `commentThreadRenderer`s in `subThreads`
+                    if subthreads:
+                        tracker['current_depth'] += 1
+                        for entry in extract_thread(subthreads, entity_payloads):
+                            if entry:
+                                yield entry
+                        tracker['current_depth'] -= 1
+                        # All of the subThreads' `continuationItemRenderer`s were within the nested
+                        # `commentThreadRenderer`s and are now exhausted, so avoid unnecessary recursion below
+                        continue
+
                    tracker['current_page_thread'] += 1
+                    tracker['current_depth'] += 1
+                    # Recursively extract from `continuationItemRenderer`s in `subThreads`
                    comment_entries_iter = self._comment_entries(
                        comment_replies_renderer, ytcfg, video_id,
-                        parent=comment.get('id'), tracker=tracker)
+                        parent=comment_id, tracker=tracker)
                    yield from itertools.islice(comment_entries_iter, min(
                        max_replies_per_thread, max(0, max_replies - tracker['total_reply_comments'])))
+                    tracker['current_depth'] -= 1

        # Keeps track of counts across recursive calls
        if not tracker:
@@ -2509,19 +2565,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'total_reply_comments': 0,
                'seen_comment_ids': set(),
                'pinned_comment_ids': set(),
+                'current_depth': 1,
            }

-        # TODO: Deprecated
-        # YouTube comments have a max depth of 2
-        max_depth = int_or_none(get_single_config_arg('max_comment_depth'))
-        if max_depth:
-            self._downloader.deprecated_feature('[youtube] max_comment_depth extractor argument is deprecated. '
-                                                'Set max replies in the max-comments extractor argument instead')
-        if max_depth == 1 and parent:
-            return
+        _max_comments, max_parents, max_replies, max_replies_per_thread, max_depth, *_ = (
+            int_or_none(p, default=sys.maxsize) for p in self._configuration_arg('max_comments') + [''] * 5)

-        _max_comments, max_parents, max_replies, max_replies_per_thread, *_ = (
-            int_or_none(p, default=sys.maxsize) for p in self._configuration_arg('max_comments') + [''] * 4)
+        if max_depth < tracker['current_depth']:
+            return

        continuation = self._extract_continuation(root_continuation_data)

@@ -2550,6 +2601,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    note_prefix = '    Downloading comment API JSON reply thread %d %s' % (
                        tracker['current_page_thread'], comment_prog_str)
            else:
+                # TODO: `parent` is only truthy in this code path with YT's legacy (non-threaded) comment view
                note_prefix = '{}Downloading comment{} API JSON page {} {}'.format(
                    '       ' if parent else '', ' replies' if parent else '',
                    page_num, comment_prog_str)
@@ -2566,6 +2618,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
                    check_get_keys=check_get_keys)
            except ExtractorError as e:
+                # TODO: This code path is not reached since eb5bdbfa70126c7d5355cc0954b63720522e462c
                # Ignore incomplete data error for replies if retries didn't work.
                # This is to allow any other parent comments and comment threads to be downloaded.
                # See: https://github.com/yt-dlp/yt-dlp/issues/4669