Derive variant display names from upstream data

This commit is contained in:
2026-04-24 10:50:55 +08:00
parent 89b89d4f19
commit 3fe6876ef0
6 changed files with 17961 additions and 17152 deletions
+72 -28
View File
@@ -463,6 +463,40 @@ def extract_codes(text: str) -> List[str]:
return [code.strip() for code in BACKTICK_RE.findall(text) if code.strip()]
def infer_base_variant_name(variant_name: str, entry_title: str) -> Optional[str]:
    """Reduce a variant label to a base model name found in the entry title.

    Only the text before the first whitespace-delimited ``/`` is considered.
    One trailing Chinese market marker (e.g. ``国行版``) and then one trailing
    English region/carrier marker (e.g. ``Global``, ``Verizon``) are removed,
    and runs of whitespace are collapsed to single spaces.

    Args:
        variant_name: Raw variant label.
        entry_title: Title of the entry the variant belongs to.

    Returns:
        The cleaned base name, or ``None`` when nothing is left after
        cleaning or when ``normalize_text`` of the base name is not a
        substring of ``normalize_text(entry_title)``.
    """
    # Everything after the first " / " separator is ignored.
    head, *_ = re.split(r"\s+/\s+", variant_name.strip(), maxsplit=1)
    candidate = head.strip()

    if candidate:
        # The Chinese marker may be glued directly to the model name (\s*).
        candidate = re.sub(
            r"\s*(?:国行版|国内版|中国版|印度版|欧洲版|国际版|北美版|日本版|韩国版|港版|台版|海外版)\s*$",
            "",
            candidate,
        )
        # The English marker must be a separate trailing word (\s+), so a
        # name consisting solely of such a word is left untouched.
        candidate = re.sub(
            r"\s+(?:China|Chinese|India|Europe|European|Global|International|North America|North American|Japan|Korea|Hong Kong|Taiwan|US|USA|T-Mobile|Verizon|AT&T|SIM Free|SoftBank)\s*$",
            "",
            candidate,
            flags=re.IGNORECASE,
        )
        candidate = " ".join(candidate.split())

    if candidate and normalize_text(candidate) in normalize_text(entry_title):
        return candidate
    return None
def split_variant_groups(
    entry_title: str,
    title_codes: Iterable[str],
    variants: list[tuple[list[str], str]],
) -> list[tuple[str, Set[str]]]:
    """Group an entry's variants by their inferred base model name.

    Each variant is mapped to a base name with ``infer_base_variant_name``.
    Variants sharing a base name are merged into one alias set that holds the
    title codes, the base name, every variant name and every variant code of
    that group.

    Args:
        entry_title: Title of the entry the variants belong to.
        title_codes: Codes taken from the entry title; copied into every
            group. May be any iterable, including a one-shot iterator.
        variants: ``(codes, name)`` pairs, one per variant line.

    Returns:
        ``(base_name, aliases)`` pairs in first-seen order. An empty list is
        returned when any variant yields no base name or when fewer than two
        distinct base names are found; an empty result means "do not split".
    """
    # Materialise once. ``title_codes`` is only promised to be an Iterable, so
    # re-reading it per variant would give later groups an empty seed if the
    # caller passed an iterator, and would build a throwaway set every time.
    shared_codes = frozenset(title_codes)

    groups: dict[str, Set[str]] = {}
    for variant_codes, variant_name in variants:
        base_name = infer_base_variant_name(variant_name, entry_title)
        if not base_name:
            # One unclassifiable variant makes the whole split unreliable.
            return []
        aliases = groups.get(base_name)
        if aliases is None:
            # Each group gets its own mutable copy of the title codes.
            aliases = groups[base_name] = set(shared_codes)
        aliases.add(base_name)
        aliases.add(variant_name)
        aliases.update(variant_codes)

    # A single group is just the original entry; nothing to split.
    if len(groups) < 2:
        return []
    return list(groups.items())
def parse_brand_file(path: Path) -> List[DeviceRecord]:
file_stem = path.stem
brand = canonical_brand(file_stem)
@@ -473,43 +507,50 @@ def parse_brand_file(path: Path) -> List[DeviceRecord]:
section = ""
current_title = ""
current_title_codes: List[str] = []
current_aliases: Set[str] = set()
current_variants: list[tuple[list[str], str]] = []
def flush_current() -> None:
nonlocal current_title, current_aliases
nonlocal current_title, current_title_codes, current_aliases, current_variants
if not current_title:
return
aliases = sorted({alias.strip() for alias in current_aliases if alias.strip()})
record_id = f"{file_stem}:{len(records) + 1}"
device_type = infer_device_type(
device_name=current_title,
section=section,
source_file=path.name,
aliases=aliases,
default_type=default_type,
)
records.append(
DeviceRecord(
id=record_id,
device_name=current_title,
brand=brand,
manufacturer_brand=brand,
parent_brand=resolve_parent_brand(brand),
market_brand=infer_market_brand(
manufacturer_brand=brand,
device_name=current_title,
section=section,
aliases=aliases,
),
device_type=device_type,
aliases=aliases,
source_file=f"brands/{path.name}",
split_groups = split_variant_groups(current_title, current_title_codes, current_variants)
record_groups = split_groups or [(current_title, current_aliases)]
for device_name, raw_aliases in record_groups:
aliases = sorted({alias.strip() for alias in raw_aliases if alias.strip()})
record_id = f"{file_stem}:{len(records) + 1}"
device_type = infer_device_type(
device_name=device_name,
section=section,
source_file=path.name,
aliases=aliases,
default_type=default_type,
)
records.append(
DeviceRecord(
id=record_id,
device_name=device_name,
brand=brand,
manufacturer_brand=brand,
parent_brand=resolve_parent_brand(brand),
market_brand=infer_market_brand(
manufacturer_brand=brand,
device_name=device_name,
section=section,
aliases=aliases,
),
device_type=device_type,
aliases=aliases,
source_file=f"brands/{path.name}",
section=section,
)
)
)
current_title = ""
current_title_codes = []
current_aliases = set()
current_variants = []
for raw in lines:
line = raw.strip()
@@ -527,8 +568,10 @@ def parse_brand_file(path: Path) -> List[DeviceRecord]:
flush_current()
raw_title = entry_match.group(1).strip()
current_title = clean_entry_title(raw_title)
current_aliases = set(extract_codes(raw_title))
current_title_codes = extract_codes(raw_title)
current_aliases = set(current_title_codes)
current_aliases.add(current_title)
current_variants = []
continue
if not current_title:
@@ -538,6 +581,7 @@ def parse_brand_file(path: Path) -> List[DeviceRecord]:
if variant_match:
variant_codes = extract_codes(variant_match.group(1))
variant_name = variant_match.group(2).strip()
current_variants.append((variant_codes, variant_name))
current_aliases.update(variant_codes)
current_aliases.add(variant_name)