mirror of
https://github.com/unshackle-dl/unshackle.git
synced 2025-10-23 15:11:08 +00:00
feat(subtitle): Add filtering for unwanted cues in WebVTT subtitles
This commit is contained in:
@@ -233,6 +233,7 @@ class Subtitle(Track):
|
||||
try:
|
||||
caption_set = pycaption.WebVTTReader().read(text)
|
||||
Subtitle.merge_same_cues(caption_set)
|
||||
Subtitle.filter_unwanted_cues(caption_set)
|
||||
subtitle_text = pycaption.WebVTTWriter().write(caption_set)
|
||||
self.path.write_text(subtitle_text, encoding="utf8")
|
||||
except pycaption.exceptions.CaptionReadSyntaxError:
|
||||
@@ -241,6 +242,7 @@ class Subtitle(Track):
|
||||
try:
|
||||
caption_set = pycaption.WebVTTReader().read(text)
|
||||
Subtitle.merge_same_cues(caption_set)
|
||||
Subtitle.filter_unwanted_cues(caption_set)
|
||||
subtitle_text = pycaption.WebVTTWriter().write(caption_set)
|
||||
self.path.write_text(subtitle_text, encoding="utf8")
|
||||
except Exception:
|
||||
@@ -444,6 +446,8 @@ class Subtitle(Track):
|
||||
|
||||
caption_set = self.parse(self.path.read_bytes(), self.codec)
|
||||
Subtitle.merge_same_cues(caption_set)
|
||||
if codec == Subtitle.Codec.WebVTT:
|
||||
Subtitle.filter_unwanted_cues(caption_set)
|
||||
subtitle_text = writer().write(caption_set)
|
||||
|
||||
output_path.write_text(subtitle_text, encoding="utf8")
|
||||
@@ -520,6 +524,8 @@ class Subtitle(Track):
|
||||
|
||||
caption_set = self.parse(self.path.read_bytes(), self.codec)
|
||||
Subtitle.merge_same_cues(caption_set)
|
||||
if codec == Subtitle.Codec.WebVTT:
|
||||
Subtitle.filter_unwanted_cues(caption_set)
|
||||
subtitle_text = writer().write(caption_set)
|
||||
|
||||
output_path.write_text(subtitle_text, encoding="utf8")
|
||||
@@ -681,6 +687,24 @@ class Subtitle(Track):
|
||||
if merged_captions:
|
||||
caption_set.set_captions(lang, merged_captions)
|
||||
|
||||
@staticmethod
|
||||
def filter_unwanted_cues(caption_set: pycaption.CaptionSet):
|
||||
"""
|
||||
Filter out subtitle cues containing only or whitespace.
|
||||
"""
|
||||
for lang in caption_set.get_languages():
|
||||
captions = caption_set.get_captions(lang)
|
||||
filtered_captions = pycaption.CaptionList()
|
||||
|
||||
for caption in captions:
|
||||
text = caption.get_text().strip()
|
||||
if not text or text == " " or all(c in " \t\n\r\xa0" for c in text.replace(" ", "\xa0")):
|
||||
continue
|
||||
|
||||
filtered_captions.append(caption)
|
||||
|
||||
caption_set.set_captions(lang, filtered_captions)
|
||||
|
||||
@staticmethod
|
||||
def merge_segmented_wvtt(data: bytes, period_start: float = 0.0) -> tuple[CaptionList, Optional[str]]:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user