@staticmethod
def filter_unwanted_cues(caption_set: pycaption.CaptionSet):
    """
    Filter out subtitle cues containing only &nbsp; or whitespace.

    For every language in ``caption_set`` the caption list is rebuilt
    without the blank cues and stored back with ``set_captions``, so the
    caption set is modified in place and nothing is returned.

    A cue counts as blank when its text, with each literal ``&nbsp;``
    entity treated as a non-breaking space, consists solely of whitespace.

    Parameters:
        caption_set: The caption set to clean up.
    """
    for lang in caption_set.get_languages():
        filtered_captions = pycaption.CaptionList()

        for caption in caption_set.get_captions(lang):
            # Map the literal "&nbsp;" entity to U+00A0 *before* stripping:
            # str.strip() already removes real non-breaking spaces, but it
            # cannot see an entity that is still spelled out in the text.
            visible_text = caption.get_text().replace("&nbsp;", "\xa0").strip()
            if visible_text:
                filtered_captions.append(caption)

        caption_set.set_captions(lang, filtered_captions)