feat(subtitles): Integrate subby library for enhanced subtitle processing and conversion methods

This commit is contained in:
Andy
2025-07-30 00:24:55 +00:00
parent 06c96b88a5
commit 5dad2746b1
4 changed files with 227 additions and 1 deletions

View File

@@ -57,6 +57,7 @@ dependencies = [
"pyplayready>=0.6.0,<0.7",
"httpx>=0.28.1,<0.29",
"cryptography>=45.0.0",
"subby",
]
[project.urls]
@@ -112,3 +113,4 @@ no_implicit_optional = true
[tool.uv.sources]
unshackle = { workspace = true }
subby = { git = "https://github.com/vevv/subby.git" }

View File

@@ -45,6 +45,7 @@ class Config:
self.curl_impersonate: dict = kwargs.get("curl_impersonate") or {}
self.remote_cdm: list[dict] = kwargs.get("remote_cdm") or []
self.credentials: dict = kwargs.get("credentials") or {}
self.subtitle: dict = kwargs.get("subtitle") or {}
self.directories = self._Directories()
for name, path in (kwargs.get("directories") or {}).items():

View File

@@ -15,9 +15,11 @@ from construct import Container
from pycaption import Caption, CaptionList, CaptionNode, WebVTTReader
from pycaption.geometry import Layout
from pymp4.parser import MP4
from subby import CommonIssuesFixer, SAMIConverter, SDHStripper, WebVTTConverter
from subtitle_filter import Subtitles
from unshackle.core import binaries
from unshackle.core.config import config
from unshackle.core.tracks.track import Track
from unshackle.core.utilities import try_ensure_utf8
from unshackle.core.utils.webvtt import merge_segmented_webvtt
@@ -30,6 +32,7 @@ class Subtitle(Track):
SubStationAlphav4 = "ASS" # https://wikipedia.org/wiki/SubStation_Alpha#Advanced_SubStation_Alpha=
TimedTextMarkupLang = "TTML" # https://wikipedia.org/wiki/Timed_Text_Markup_Language
WebVTT = "VTT" # https://wikipedia.org/wiki/WebVTT
SAMI = "SMI" # https://wikipedia.org/wiki/SAMI
# MPEG-DASH box-encapsulated subtitle formats
fTTML = "STPP" # https://www.w3.org/TR/2018/REC-ttml-imsc1.0.1-20180424
fVTT = "WVTT" # https://www.w3.org/TR/webvtt1
@@ -51,6 +54,8 @@ class Subtitle(Track):
return Subtitle.Codec.TimedTextMarkupLang
elif mime == "vtt":
return Subtitle.Codec.WebVTT
elif mime in ("smi", "sami"):
return Subtitle.Codec.SAMI
elif mime == "stpp":
return Subtitle.Codec.fTTML
elif mime == "wvtt":
@@ -306,10 +311,158 @@ class Subtitle(Track):
return "\n".join(sanitized_lines)
def convert_with_subby(self, codec: Subtitle.Codec) -> Path:
    """
    Convert this Subtitle to another format, using subby for the first hop.

    The source is first turned into SRT by subby (WebVTTConverter for WebVTT,
    SAMIConverter for SAMI) and passed through subby's CommonIssuesFixer.
    If the target is SubRip the fixed SRT is written as-is; for
    TimedTextMarkupLang and WebVTT targets the fixed SRT is re-parsed and
    written with the matching pycaption writer.

    Any other source/target combination, a converter that yields nothing, or
    any error raised while converting falls back to `_convert_standard`.

    On success the file extension is changed to match the new codec, the
    original file is removed, `self.path` and `self.codec` are updated and
    the `OnConverted` callback (if callable) is invoked with the new codec.
    Errors raised by that callback are propagated to the caller.

    Parameters:
        codec: The Subtitle.Codec to convert to.

    Returns the path of the converted subtitle file.

    Raises:
        ValueError: If the subtitle track has not been downloaded yet.
    """
    if not self.path or not self.path.exists():
        raise ValueError("You must download the subtitle track first.")

    if self.codec == codec:
        return self.path

    # Source formats subby can read in this integration.
    converter_cls = {
        Subtitle.Codec.WebVTT: WebVTTConverter,
        Subtitle.Codec.SAMI: SAMIConverter,
    }.get(self.codec)

    # Non-SRT targets are produced from the fixed SRT through pycaption.
    writer_cls = {
        Subtitle.Codec.TimedTextMarkupLang: pycaption.DFXPWriter,
        Subtitle.Codec.WebVTT: pycaption.WebVTTWriter,
    }.get(codec)

    # Decide up front so no subby/pycaption work is done (and then thrown
    # away) for combinations this method cannot produce.
    if converter_cls is None or (codec != Subtitle.Codec.SubRip and writer_cls is None):
        return self._convert_standard(codec)

    output_path = self.path.with_suffix(f".{codec.value.lower()}")
    original_path = self.path

    converted = False
    try:
        srt_subtitles = converter_cls().from_file(str(self.path))
        if srt_subtitles is not None:
            fixed_srt, _ = CommonIssuesFixer().from_srt(srt_subtitles)
            srt_text = str(fixed_srt)

            if codec == Subtitle.Codec.SubRip:
                subtitle_text = srt_text
            else:
                # Parse straight from memory; no intermediate ".temp.srt" file
                # is created, so nothing can be left behind if parsing fails.
                caption_set = self.parse(srt_text.encode("utf8"), Subtitle.Codec.SubRip)
                self.merge_same_cues(caption_set)
                subtitle_text = writer_cls().write(caption_set)

            output_path.write_text(subtitle_text, encoding="utf8")
            converted = True
    except Exception:
        # Deliberate best-effort: subby is an optional fast path, so any
        # failure here is handled by the standard conversion below.
        converted = False

    if not converted:
        return self._convert_standard(codec)

    if original_path.exists() and original_path != output_path:
        original_path.unlink()

    self.path = output_path
    self.codec = codec

    if callable(self.OnConverted):
        self.OnConverted(codec)

    return output_path
def convert(self, codec: Subtitle.Codec) -> Path:
    """
    Convert this Subtitle to another Format.

    Dispatches to one of the conversion back-ends based on the
    'conversion_method' key of the subtitle config ("auto" when unset):
    - 'auto': convert_with_subby for WebVTT/SAMI sources, _convert_standard otherwise
    - 'subby': convert_with_subby
    - 'subtitleedit': _convert_standard (the standard path is the one that uses SubtitleEdit)
    - 'pycaption': _convert_pycaption_only
    - any other value: _convert_standard

    Returns the path returned by the selected back-end.
    """
    method = config.subtitle.get("conversion_method", "auto")

    if method == "pycaption":
        return self._convert_pycaption_only(codec)

    # "subby" always takes the subby path; "auto" only for the source
    # formats subby is preferred for.
    subby_sources = (Subtitle.Codec.WebVTT, Subtitle.Codec.SAMI)
    use_subby = method == "subby" or (method == "auto" and self.codec in subby_sources)
    if use_subby:
        return self.convert_with_subby(codec)

    # "subtitleedit", "auto" for other sources, and unrecognised values.
    return self._convert_standard(codec)
def _convert_pycaption_only(self, codec: Subtitle.Codec) -> Path:
    """
    Convert this Subtitle to another format with pycaption alone.

    Neither SubtitleEdit nor subby's converters are invoked directly here;
    the source is parsed with `parse` and written with a pycaption writer.
    Supported targets are SubRip, TimedTextMarkupLang and WebVTT.

    On success the converted file replaces the original (same location, new
    extension), `self.path` and `self.codec` are updated and the
    `OnConverted` callback (if callable) is invoked with the new codec.

    Raises:
        ValueError: If the subtitle track has not been downloaded yet.
        NotImplementedError: If pycaption has no writer for the target codec.
    """
    source_path = self.path
    if not source_path or not source_path.exists():
        raise ValueError("You must download the subtitle track first.")

    # Already in the requested format, nothing to do.
    if self.codec == codec:
        return source_path

    target_path = source_path.with_suffix(f".{codec.value.lower()}")

    if codec == Subtitle.Codec.SubRip:
        writer_cls = pycaption.SRTWriter
    elif codec == Subtitle.Codec.TimedTextMarkupLang:
        writer_cls = pycaption.DFXPWriter
    elif codec == Subtitle.Codec.WebVTT:
        writer_cls = pycaption.WebVTTWriter
    else:
        raise NotImplementedError(f"Cannot convert {self.codec.name} to {codec.name} using pycaption only.")

    captions = self.parse(source_path.read_bytes(), self.codec)
    Subtitle.merge_same_cues(captions)
    rendered = writer_cls().write(captions)
    target_path.write_text(rendered, encoding="utf8")

    # Drop the source file unless the conversion wrote over it in place.
    if source_path.exists() and source_path != target_path:
        source_path.unlink()

    self.path = target_path
    self.codec = codec

    if callable(self.OnConverted):
        self.OnConverted(codec)

    return target_path
def _convert_standard(self, codec: Subtitle.Codec) -> Path:
"""
Convert this Subtitle to another Format.
The file path location of the Subtitle data will be kept at the same
location but the file extension will be changed appropriately.
@@ -318,6 +471,7 @@ class Subtitle(Track):
- TimedTextMarkupLang - SubtitleEdit or pycaption.DFXPWriter
- WebVTT - SubtitleEdit or pycaption.WebVTTWriter
- SubStationAlphav4 - SubtitleEdit
- SAMI - subby.SAMIConverter (when available)
- fTTML* - custom code using some pycaption functions
- fVTT* - custom code using some pycaption functions
*: Can read from format, but cannot convert to format
@@ -416,6 +570,13 @@ class Subtitle(Track):
text = Subtitle.sanitize_broken_webvtt(text)
text = Subtitle.space_webvtt_headers(text)
caption_set = pycaption.WebVTTReader().read(text)
elif codec == Subtitle.Codec.SAMI:
# Use subby for SAMI parsing
converter = SAMIConverter()
srt_subtitles = converter.from_bytes(data)
# Convert SRT back to CaptionSet for compatibility
srt_text = str(srt_subtitles).encode("utf8")
caption_set = Subtitle.parse(srt_text, Subtitle.Codec.SubRip)
else:
raise ValueError(f'Unknown Subtitle format "{codec}"...')
except pycaption.exceptions.CaptionReadSyntaxError as e:
@@ -660,11 +821,45 @@ class Subtitle(Track):
def strip_hearing_impaired(self) -> None:
"""
Strip captions for hearing impaired (SDH).
It uses SubtitleEdit if available, otherwise filter-subs.
The SDH stripping method is determined by the 'sdh_method' setting in config:
- 'auto' (default): Tries subby first (SubRip/SRT tracks only), then SubtitleEdit, then filter-subs
- 'subby': Uses subby's SDHStripper (SubRip/SRT tracks only)
- 'subtitleedit': Uses SubtitleEdit when available
- 'filter-subs': Uses subtitle-filter library
"""
if not self.path or not self.path.exists():
raise ValueError("You must download the subtitle track first.")
# Check configuration for SDH stripping method
sdh_method = config.subtitle.get("sdh_method", "auto")
if sdh_method == "subby" and self.codec == Subtitle.Codec.SubRip:
# Use subby's SDHStripper directly on the file
stripper = SDHStripper()
stripped_srt, _ = stripper.from_file(str(self.path))
self.path.write_text(str(stripped_srt), encoding="utf8")
return
elif sdh_method == "subtitleedit" and binaries.SubtitleEdit:
# Force use of SubtitleEdit
pass # Continue to SubtitleEdit section below
elif sdh_method == "filter-subs":
# Force use of filter-subs
sub = Subtitles(self.path)
sub.filter(rm_fonts=True, rm_ast=True, rm_music=True, rm_effects=True, rm_names=True, rm_author=True)
sub.save()
return
elif sdh_method == "auto":
# Try subby first for SRT files, then fall back
if self.codec == Subtitle.Codec.SubRip:
try:
stripper = SDHStripper()
stripped_srt, _ = stripper.from_file(str(self.path))
self.path.write_text(str(stripped_srt), encoding="utf8")
return
except Exception:
pass # Fall through to other methods
if binaries.SubtitleEdit:
if self.codec == Subtitle.Codec.SubStationAlphav4:
output_format = "AdvancedSubStationAlpha"

28
uv.lock generated
View File

@@ -1391,6 +1391,26 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e7/9c/0e6afc12c269578be5c0c1c9f4b49a8d32770a080260c333ac04cc1c832d/soupsieve-2.7-py3-none-any.whl", hash = "sha256:6e60cc5c1ffaf1cebcc12e8188320b72071e922c2e897f737cadce79ad5d30c4", size = 36677, upload-time = "2025-04-20T18:50:07.196Z" },
]
[[package]]
name = "srt"
version = "3.5.3"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/66/b7/4a1bc231e0681ebf339337b0cd05b91dc6a0d701fa852bb812e244b7a030/srt-3.5.3.tar.gz", hash = "sha256:4884315043a4f0740fd1f878ed6caa376ac06d70e135f306a6dc44632eed0cc0", size = 28296, upload-time = "2023-03-28T02:35:44.007Z" }
[[package]]
name = "subby"
version = "0.3.21"
source = { git = "https://github.com/vevv/subby.git#390cb2f4a55e98057cdd65314d8cbffd5d0a11f1" }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "click" },
{ name = "langcodes" },
{ name = "lxml" },
{ name = "pymp4" },
{ name = "srt" },
{ name = "tinycss" },
]
[[package]]
name = "subtitle-filter"
version = "1.5.0"
@@ -1400,6 +1420,12 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/10/40/c5d138e1f302b25240678943422a646feea52bab1f594c669c101c5e5070/subtitle_filter-1.5.0-py3-none-any.whl", hash = "sha256:6b506315be64870fba2e6894a70d76389407ce58c325fdf05129e0530f0a0f5b", size = 8346, upload-time = "2024-08-01T22:42:47.787Z" },
]
[[package]]
name = "tinycss"
version = "0.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/05/59/af583fff6236c7d2f94f8175c40ce501dcefb8d1b42e4bb7a2622dff689e/tinycss-0.4.tar.gz", hash = "sha256:12306fb50e5e9e7eaeef84b802ed877488ba80e35c672867f548c0924a76716e", size = 87759, upload-time = "2016-09-23T16:30:14.894Z" }
[[package]]
name = "tomli"
version = "2.2.1"
@@ -1510,6 +1536,7 @@ dependencies = [
{ name = "rlaphoenix-m3u8" },
{ name = "ruamel-yaml" },
{ name = "sortedcontainers" },
{ name = "subby" },
{ name = "subtitle-filter" },
{ name = "unidecode" },
{ name = "urllib3" },
@@ -1558,6 +1585,7 @@ requires-dist = [
{ name = "rlaphoenix-m3u8", specifier = ">=3.4.0,<4" },
{ name = "ruamel-yaml", specifier = ">=0.18.6,<0.19" },
{ name = "sortedcontainers", specifier = ">=2.4.0,<3" },
{ name = "subby", git = "https://github.com/vevv/subby.git" },
{ name = "subtitle-filter", specifier = ">=1.4.9,<2" },
{ name = "unidecode", specifier = ">=1.3.8,<2" },
{ name = "urllib3", specifier = ">=2.2.1,<3" },