Derive variant display names from upstream data
This commit is contained in:
+72
-28
@@ -463,6 +463,40 @@ def extract_codes(text: str) -> List[str]:
|
||||
return [code.strip() for code in BACKTICK_RE.findall(text) if code.strip()]
|
||||
|
||||
|
||||
def infer_base_variant_name(variant_name: str, entry_title: str) -> Optional[str]:
    """Derive the base model name that a variant label refers to.

    Only the text before the first whitespace-delimited ``/`` in
    *variant_name* is considered. A trailing region/carrier marker (one of
    the Chinese or English markers listed in the patterns below) is removed
    and runs of whitespace are collapsed to single spaces.

    Returns the resulting name, or ``None`` when nothing is left or when the
    name, after ``normalize_text``, is not a substring of the normalized
    *entry_title*.
    """
    # Chinese markers may directly follow the name; matched case-sensitively.
    cjk_region_suffix = r"\s*(?:国行版|国内版|中国版|印度版|欧洲版|国际版|北美版|日本版|韩国版|港版|台版|海外版)\s*$"
    # English markers must be preceded by whitespace; matched case-insensitively.
    latin_region_suffix = r"\s+(?:China|Chinese|India|Europe|European|Global|International|North America|North American|Japan|Korea|Hong Kong|Taiwan|US|USA|T-Mobile|Verizon|AT&T|SIM Free|SoftBank)\s*$"

    head, *_ = re.split(r"\s+/\s+", variant_name.strip(), maxsplit=1)
    candidate = head.strip()
    if candidate == "":
        return None

    candidate = re.compile(cjk_region_suffix).sub("", candidate)
    candidate = re.compile(latin_region_suffix, re.IGNORECASE).sub("", candidate)
    candidate = " ".join(candidate.split())

    # Accept the candidate only if the entry title actually mentions it.
    if candidate and normalize_text(candidate) in normalize_text(entry_title):
        return candidate
    return None
|
||||
|
||||
|
||||
def split_variant_groups(entry_title: str, title_codes: Iterable[str], variants: list[tuple[list[str], str]]) -> list[tuple[str, Set[str]]]:
    """Group an entry's variants by their inferred base name.

    Each ``(variant_codes, variant_name)`` pair in *variants* is assigned to
    the base name returned by ``infer_base_variant_name``. Every group's
    alias set starts with all of *title_codes* and then collects the base
    name, the variant names, and the variant codes that belong to it.

    Returns a list of ``(base_name, aliases)`` pairs in first-seen order, or
    an empty list when any variant has no inferable base name or when fewer
    than two distinct base names are found (i.e. there is nothing to split).
    """
    # Materialize once: title_codes is only promised to be an Iterable, so a
    # one-shot iterator would otherwise be drained by the first group and
    # leave every later group without the title codes. This also avoids
    # rebuilding the seed set on every loop iteration.
    shared_codes = tuple(title_codes)

    groups: dict[str, Set[str]] = {}
    for variant_codes, variant_name in variants:
        base_name = infer_base_variant_name(variant_name, entry_title)
        if not base_name:
            # A single unclassifiable variant makes the whole split unreliable.
            return []
        aliases = groups.get(base_name)
        if aliases is None:
            aliases = groups[base_name] = set(shared_codes)
        aliases.add(base_name)
        aliases.add(variant_name)
        aliases.update(variant_codes)

    if len(groups) < 2:
        return []
    return list(groups.items())
|
||||
|
||||
|
||||
def parse_brand_file(path: Path) -> List[DeviceRecord]:
|
||||
file_stem = path.stem
|
||||
brand = canonical_brand(file_stem)
|
||||
@@ -473,43 +507,50 @@ def parse_brand_file(path: Path) -> List[DeviceRecord]:
|
||||
|
||||
section = ""
|
||||
current_title = ""
|
||||
current_title_codes: List[str] = []
|
||||
current_aliases: Set[str] = set()
|
||||
current_variants: list[tuple[list[str], str]] = []
|
||||
|
||||
def flush_current() -> None:
|
||||
nonlocal current_title, current_aliases
|
||||
nonlocal current_title, current_title_codes, current_aliases, current_variants
|
||||
if not current_title:
|
||||
return
|
||||
|
||||
aliases = sorted({alias.strip() for alias in current_aliases if alias.strip()})
|
||||
record_id = f"{file_stem}:{len(records) + 1}"
|
||||
device_type = infer_device_type(
|
||||
device_name=current_title,
|
||||
section=section,
|
||||
source_file=path.name,
|
||||
aliases=aliases,
|
||||
default_type=default_type,
|
||||
)
|
||||
records.append(
|
||||
DeviceRecord(
|
||||
id=record_id,
|
||||
device_name=current_title,
|
||||
brand=brand,
|
||||
manufacturer_brand=brand,
|
||||
parent_brand=resolve_parent_brand(brand),
|
||||
market_brand=infer_market_brand(
|
||||
manufacturer_brand=brand,
|
||||
device_name=current_title,
|
||||
section=section,
|
||||
aliases=aliases,
|
||||
),
|
||||
device_type=device_type,
|
||||
aliases=aliases,
|
||||
source_file=f"brands/{path.name}",
|
||||
split_groups = split_variant_groups(current_title, current_title_codes, current_variants)
|
||||
record_groups = split_groups or [(current_title, current_aliases)]
|
||||
for device_name, raw_aliases in record_groups:
|
||||
aliases = sorted({alias.strip() for alias in raw_aliases if alias.strip()})
|
||||
record_id = f"{file_stem}:{len(records) + 1}"
|
||||
device_type = infer_device_type(
|
||||
device_name=device_name,
|
||||
section=section,
|
||||
source_file=path.name,
|
||||
aliases=aliases,
|
||||
default_type=default_type,
|
||||
)
|
||||
records.append(
|
||||
DeviceRecord(
|
||||
id=record_id,
|
||||
device_name=device_name,
|
||||
brand=brand,
|
||||
manufacturer_brand=brand,
|
||||
parent_brand=resolve_parent_brand(brand),
|
||||
market_brand=infer_market_brand(
|
||||
manufacturer_brand=brand,
|
||||
device_name=device_name,
|
||||
section=section,
|
||||
aliases=aliases,
|
||||
),
|
||||
device_type=device_type,
|
||||
aliases=aliases,
|
||||
source_file=f"brands/{path.name}",
|
||||
section=section,
|
||||
)
|
||||
)
|
||||
)
|
||||
current_title = ""
|
||||
current_title_codes = []
|
||||
current_aliases = set()
|
||||
current_variants = []
|
||||
|
||||
for raw in lines:
|
||||
line = raw.strip()
|
||||
@@ -527,8 +568,10 @@ def parse_brand_file(path: Path) -> List[DeviceRecord]:
|
||||
flush_current()
|
||||
raw_title = entry_match.group(1).strip()
|
||||
current_title = clean_entry_title(raw_title)
|
||||
current_aliases = set(extract_codes(raw_title))
|
||||
current_title_codes = extract_codes(raw_title)
|
||||
current_aliases = set(current_title_codes)
|
||||
current_aliases.add(current_title)
|
||||
current_variants = []
|
||||
continue
|
||||
|
||||
if not current_title:
|
||||
@@ -538,6 +581,7 @@ def parse_brand_file(path: Path) -> List[DeviceRecord]:
|
||||
if variant_match:
|
||||
variant_codes = extract_codes(variant_match.group(1))
|
||||
variant_name = variant_match.group(2).strip()
|
||||
current_variants.append((variant_codes, variant_name))
|
||||
current_aliases.update(variant_codes)
|
||||
current_aliases.add(variant_name)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user