Files
MobileModels/tools/device_mapper.py
2026-03-19 17:34:45 +08:00

760 lines
24 KiB
Python

#!/usr/bin/env python3
"""Build and query a cross-platform device mapping index from MobileModels markdown data."""
from __future__ import annotations
import argparse
from collections import Counter
import json
import re
from dataclasses import asdict, dataclass
from datetime import date
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Set
from project_layout import PROJECT_ROOT, WORKSPACE_ROOT
# A device entry heading: a whole line wrapped in bold markers (**Title**);
# group 1 is the text between the markers.
ENTRY_RE = re.compile(r"^\*\*(.+?)\*\*\s*$")
# A variant line: one or more backtick-quoted codes, a colon, then a name.
# Group 1 is the run of backticked codes, group 2 is the name after the colon.
VARIANT_RE = re.compile(r"^\s*((?:`[^`]+`\s*)+):\s*(.+?)\s*$")
# A single backtick-quoted code; group 1 is the text between the backticks.
BACKTICK_RE = re.compile(r"`([^`]+)`")
# A level-two markdown heading ("## Section"); group 1 is the heading text.
SECTION_RE = re.compile(r"^##\s+(.+?)\s*$")
# Maps the stem of a brand markdown file to its canonical brand name.
# canonical_brand() falls back to the stem itself for stems not listed here.
FILE_BRAND_MAP: Dict[str, str] = {
    "360shouji": "360",
    "apple_all": "Apple",
    "apple_all_en": "Apple",
    "apple_cn": "Apple",
    "asus_cn": "ASUS",
    "asus_en": "ASUS",
    "blackshark": "Black Shark",
    "blackshark_en": "Black Shark",
    "coolpad": "Coolpad",
    "google": "Google",
    "honor_cn": "HONOR",
    "honor_global_en": "HONOR",
    "huawei_cn": "HUAWEI",
    "huawei_global_en": "HUAWEI",
    "lenovo_cn": "Lenovo",
    "letv": "LeTV",
    "meizu": "Meizu",
    "meizu_en": "Meizu",
    "mitv_cn": "Xiaomi",
    "mitv_global_en": "Xiaomi",
    "motorola_cn": "Motorola",
    "nokia_cn": "Nokia",
    "nothing": "Nothing",
    "nubia": "nubia",
    "oneplus": "OnePlus",
    "oneplus_en": "OnePlus",
    "oppo_cn": "OPPO",
    "oppo_global_en": "OPPO",
    "realme_cn": "realme",
    "realme_global_en": "realme",
    "samsung_cn": "Samsung",
    "samsung_global_en": "Samsung",
    "smartisan": "Smartisan",
    "sony": "Sony",
    "sony_cn": "Sony",
    "vivo_cn": "vivo",
    "vivo_global_en": "vivo",
    "xiaomi": "Xiaomi",
    "xiaomi_cn": "Xiaomi",
    "xiaomi_en": "Xiaomi",
    "xiaomi-wear": "Xiaomi",
    "zhixuan": "HUAWEI Smart Selection",
    "zte_cn": "ZTE",
}
# Fallback device type per brand-file stem, used by infer_device_type() when
# no keyword matches. parse_brand_file() uses "phone" for stems not listed.
FILE_DEFAULT_DEVICE_TYPE: Dict[str, str] = {
    "mitv_cn": "tv",
    "mitv_global_en": "tv",
    "xiaomi-wear": "wear",
    "apple_all": "phone",
    "apple_all_en": "phone",
    "apple_cn": "phone",
    "google": "phone",
    "honor_cn": "phone",
    "honor_global_en": "phone",
    "huawei_cn": "phone",
    "huawei_global_en": "phone",
    "xiaomi": "phone",
    "xiaomi_cn": "phone",
    "xiaomi_en": "phone",
    "zhixuan": "phone",
}
# Known spellings for each brand. brand_aliases() adds the brand name itself,
# and every alias is passed through normalize_text() before being used as a
# lookup key, so case, spaces and punctuation here do not matter.
BRAND_ALIASES: Dict[str, List[str]] = {
    "360": ["360", "360手机", "奇酷", "qiku"],
    "Apple": ["apple", "苹果", "iphone", "ipad", "ipod"],
    "ASUS": ["asus", "华硕", "rog", "zenfone"],
    "Black Shark": ["black shark", "blackshark", "黑鲨"],
    "Coolpad": ["coolpad", "酷派"],
    "Google": ["google", "pixel"],
    "HONOR": ["honor", "荣耀"],
    "HUAWEI": ["huawei", "华为"],
    "HUAWEI Smart Selection": ["华为智选", "zhixuan", "umagic", "wiko", "hi nova", "nzone"],
    "Lenovo": ["lenovo", "联想", "zuk", "拯救者"],
    "LeTV": ["letv", "乐视"],
    "Meizu": ["meizu", "魅族"],
    "Motorola": ["motorola", "摩托罗拉", "moto"],
    "Nokia": ["nokia", "诺基亚"],
    "Nothing": ["nothing", "cmf"],
    "nubia": ["nubia", "努比亚", "红魔", "redmagic"],
    "iQOO": ["iqoo", "i qoo", "艾酷"],
    "OnePlus": ["oneplus", "一加"],
    "OPPO": ["oppo"],
    "POCO": ["poco"],
    "Redmi": ["redmi", "红米", "hongmi"],
    "realme": ["realme", "真我"],
    "Samsung": ["samsung", "三星", "galaxy"],
    "Smartisan": ["smartisan", "锤子", "坚果"],
    "Sony": ["sony", "索尼", "xperia"],
    "vivo": ["vivo"],
    "Xiaomi": ["xiaomi", "小米", "mi", "米家", "mipad"],
    "ZTE": ["zte", "中兴"],
}
# Maps a brand to its parent brand. resolve_parent_brand() treats any brand
# not listed here as its own parent.
MANUFACTURER_PARENT_BRAND: Dict[str, str] = {
    "Black Shark": "Xiaomi",
    "HUAWEI Smart Selection": "HUAWEI",
    "Motorola": "Lenovo",
    "iQOO": "vivo",
    "POCO": "Xiaomi",
    "Redmi": "Xiaomi",
    "OnePlus": "OPPO",
    "realme": "OPPO",
    "nubia": "ZTE",
}
# Aliases of the market (consumer-facing) brands that are distinguished inside
# a single manufacturer's data. infer_market_brand() searches for them in the
# device text; DeviceMapper uses them to interpret a brand filter.
MARKET_BRAND_ALIASES: Dict[str, List[str]] = {
    "iQOO": ["iqoo", "i qoo", "艾酷"],
    "POCO": ["poco"],
    "Redmi": ["redmi", "红米", "hongmi"],
    "Xiaomi": ["xiaomi", "小米", "mi", "mipad", "米家"],
}
# Maps each market brand in MARKET_BRAND_ALIASES to the manufacturer brand
# whose records carry it.
MARKET_BRAND_TO_MANUFACTURER: Dict[str, str] = {
    "iQOO": "vivo",
    "POCO": "Xiaomi",
    "Redmi": "Xiaomi",
    "Xiaomi": "Xiaomi",
}
# Keywords that classify a record as "tv". infer_device_type() checks this
# list first; has_keyword() matches by normalized substring.
TV_KEYWORDS = [
    "tv",
    "电视",
    "智慧屏",
    "smart tv",
    "机顶盒",
    "tv box",
    "stick",
    "dongle",
]
# Keywords that classify a record as "tablet"; checked after TV_KEYWORDS.
TABLET_KEYWORDS = [
    "ipad",
    "tablet",
    "tab",
    "pad",
    "平板",
    "matepad",
]
# Keywords that classify a record as "wear"; checked after TABLET_KEYWORDS.
WEAR_KEYWORDS = [
    "watch",
    "smartwatch",
    "手表",
    "手环",
    "band",
    "wear",
    "wearable",
    "buds",
    "earbuds",
    "耳机",
    "tws",
    "eyewear",
    "glasses",
    "眼镜",
]
# Keywords that classify a record as "other"; checked after WEAR_KEYWORDS.
OTHER_KEYWORDS = [
    "matebook",
    "笔记本",
    "laptop",
    "notebook",
    "vision",
    "vr",
    "ipod",
    "airpods",
]
# Keywords that classify a record as "phone"; checked last, after every other
# keyword list has failed to match.
PHONE_KEYWORDS = [
    "iphone",
    "phone",
    "手机",
    "galaxy",
    "pixel",
    "xiaomi",
    "redmi",
    "poco",
    "honor",
    "huawei",
    "mate",
    "nova",
    "oppo",
    "vivo",
    "realme",
    "oneplus",
    "nokia",
    "nubia",
    "meizu",
    "lenovo",
    "motorola",
    "zte",
    "smartisan",
    "zenfone",
    "rog",
    "麦芒",
    "畅享",
    "优畅享",
]
@dataclass
class DeviceRecord:
    """One device entry parsed from a brand markdown file."""

    # "<file stem>:<1-based position of the entry within that file>".
    id: str
    # Entry title after clean_entry_title().
    device_name: str
    # Canonical brand of the source file (see canonical_brand()).
    brand: str
    # parse_brand_file() sets this to the same value as `brand`.
    manufacturer_brand: str
    # resolve_parent_brand() applied to the brand.
    parent_brand: str
    # Result of infer_market_brand() for this entry.
    market_brand: str
    # Result of infer_device_type(): "tv", "tablet", "wear", "other", "phone",
    # or the source file's default type.
    device_type: str
    # Sorted, de-duplicated names and codes for the entry, including the title.
    aliases: List[str]
    # "brands/<file name>" of the markdown file the entry came from.
    source_file: str
    # Text of a "## " heading associated with the entry ("" when none was seen).
    section: str
def normalize_text(text: str) -> str:
    """Return *text* reduced to a comparison key.

    The text is lower-cased and every character other than ASCII digits,
    ASCII lower-case letters and characters in U+4E00..U+9FFF is dropped, so
    spacing and punctuation never affect a match.
    """
    lowered = text.lower()
    return re.sub(r"[^0-9a-z\u4e00-\u9fff]+", "", lowered)
def canonical_brand(file_stem: str) -> str:
    """Return the canonical brand for a brand-file stem.

    Stems missing from FILE_BRAND_MAP are returned unchanged.
    """
    if file_stem in FILE_BRAND_MAP:
        return FILE_BRAND_MAP[file_stem]
    return file_stem
def brand_aliases(brand: str) -> List[str]:
    """Return the sorted, de-duplicated aliases of *brand*, including *brand* itself."""
    known = BRAND_ALIASES.get(brand, [])
    return sorted({brand, *known})
def has_keyword(text: str, keywords: Iterable[str]) -> bool:
    """Return True when any keyword occurs in *text* after normalization.

    Both sides go through normalize_text(), so this is a substring test on
    text with spaces and punctuation removed. Keywords that normalize to an
    empty string never match.
    """
    haystack = normalize_text(text)
    needles = (normalize_text(keyword) for keyword in keywords)
    return any(needle and needle in haystack for needle in needles)
def resolve_parent_brand(manufacturer_brand: str) -> str:
    """Return the parent brand of *manufacturer_brand*.

    A brand without an entry in MANUFACTURER_PARENT_BRAND is its own parent.
    """
    try:
        return MANUFACTURER_PARENT_BRAND[manufacturer_brand]
    except KeyError:
        return manufacturer_brand
def infer_market_brand(
    manufacturer_brand: str,
    device_name: str,
    section: str,
    aliases: Iterable[str],
) -> str:
    """Pick the market brand of a device from the text that describes it.

    Only Xiaomi and vivo records are split further: Xiaomi into POCO, Redmi
    or Xiaomi (POCO is tested before Redmi), vivo into iQOO or vivo. Every
    other manufacturer is returned unchanged. Matching is a normalized
    substring search over the device name, section and aliases.
    """
    corpus = normalize_text(" ".join([device_name, section, *aliases]))

    def mentions(market_brand: str) -> bool:
        # True when any alias of the market brand occurs in the corpus.
        for alias in MARKET_BRAND_ALIASES[market_brand]:
            key = normalize_text(alias)
            if key and key in corpus:
                return True
        return False

    if manufacturer_brand == "Xiaomi":
        if mentions("POCO"):
            return "POCO"
        if mentions("Redmi"):
            return "Redmi"
        return "Xiaomi"
    if manufacturer_brand == "vivo":
        return "iQOO" if mentions("iQOO") else "vivo"
    return manufacturer_brand
def infer_device_type(
    device_name: str,
    section: str,
    source_file: str,
    aliases: Iterable[str],
    default_type: str,
) -> str:
    """Classify a device as "tv", "tablet", "wear", "other" or "phone".

    The device name, section, aliases and source file name are joined into
    one text and tested against the keyword lists in a fixed priority order;
    the first list with a hit decides. Without any hit the result is
    *default_type*, or "other" when that is empty.

    NOTE(review): has_keyword() matches normalized substrings and the file
    name is part of the text, so "letv.md" always hits the "tv" keyword and
    "coolpad.md" the "pad" keyword. Because normalization removes spaces, a
    keyword can also match across a word boundary. Confirm whether these
    classifications are intended.
    """
    corpus = " ".join([device_name, section, *aliases, source_file])
    # Order matters: earlier lists win over later ones.
    ordered_checks = (
        (TV_KEYWORDS, "tv"),
        (TABLET_KEYWORDS, "tablet"),
        (WEAR_KEYWORDS, "wear"),
        (OTHER_KEYWORDS, "other"),
        (PHONE_KEYWORDS, "phone"),
    )
    for keywords, label in ordered_checks:
        if has_keyword(corpus, keywords):
            return label
    return default_type or "other"
def clean_entry_title(raw_title: str) -> str:
    """Reduce a raw entry heading to the bare device name.

    Removes, in order: a trailing colon, trailing backticked codenames such
    as (`foo`) (`bar`), a trailing "(codename ...)" / "(代号...)" group, a
    leading bracketed tag such as [`X1`] or [X1], and markdown link syntax
    (keeping the link text). Runs of whitespace are collapsed to one space.

    Args:
        raw_title: Text found between the bold markers of an entry line.

    Returns:
        The cleaned title; may be empty when nothing but markup remains.
    """
    title = raw_title.strip()
    if title.endswith(":"):
        title = title[:-1].strip()
    # Remove one or more trailing codenames like: (`foo`) (`bar`)
    title = re.sub(r"(?:\s*\(\s*`[^`]+`\s*\))+\s*$", "", title)
    title = re.sub(r"\s*\((?:codename|代号)[^)]*\)\s*$", "", title, flags=re.IGNORECASE)
    # Remove a leading tag like [`X1`] or [X1]. The lookahead keeps a leading
    # markdown link "[Foo](url)" intact; without it the link text was deleted
    # as if it were a tag and only "(url)" survived.
    title = re.sub(r"^\[[^\]]+\](?!\([^)`]*\))\s*", "", title)
    # Strip markdown links while keeping text: [Foo](url) -> Foo
    title = re.sub(r"\[([^\]]+)\]\([^)]*\)", r"\1", title)
    return " ".join(title.split())
def extract_codes(text: str) -> List[str]:
    """Return every non-blank backtick-quoted code in *text*, in order of appearance."""
    codes: List[str] = []
    for quoted in BACKTICK_RE.findall(text):
        code = quoted.strip()
        if code:
            codes.append(code)
    return codes
def parse_brand_file(path: Path) -> List[DeviceRecord]:
    """Parse one brand markdown file into device records.

    Recognized lines (after stripping surrounding whitespace):

    * "## Heading" starts a new section.
    * "**Title**" starts a new device entry; backticked codes inside the
      title and the cleaned title itself become aliases.
    * "`code` `code`: Name" adds the codes and the name as aliases of the
      current entry.

    Blank lines and anything else are ignored, as are variant lines that
    appear before the first entry of a section.

    Args:
        path: Markdown file to read (UTF-8).

    Returns:
        The records in file order, with ids "<file stem>:<1-based index>".
    """
    file_stem = path.stem
    brand = canonical_brand(file_stem)
    default_type = FILE_DEFAULT_DEVICE_TYPE.get(file_stem, "phone")
    records: List[DeviceRecord] = []
    lines = path.read_text(encoding="utf-8").splitlines()
    section = ""
    current_title = ""
    current_aliases: Set[str] = set()

    def flush_current() -> None:
        # Turn the pending entry, if any, into a record and reset the state.
        # `section` is read when this runs, so it must still hold the section
        # the pending entry belongs to.
        nonlocal current_title, current_aliases
        if not current_title:
            return
        aliases = sorted({alias.strip() for alias in current_aliases if alias.strip()})
        record_id = f"{file_stem}:{len(records) + 1}"
        device_type = infer_device_type(
            device_name=current_title,
            section=section,
            source_file=path.name,
            aliases=aliases,
            default_type=default_type,
        )
        records.append(
            DeviceRecord(
                id=record_id,
                device_name=current_title,
                brand=brand,
                manufacturer_brand=brand,
                parent_brand=resolve_parent_brand(brand),
                market_brand=infer_market_brand(
                    manufacturer_brand=brand,
                    device_name=current_title,
                    section=section,
                    aliases=aliases,
                ),
                device_type=device_type,
                aliases=aliases,
                source_file=f"brands/{path.name}",
                section=section,
            )
        )
        current_title = ""
        current_aliases = set()

    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        section_match = SECTION_RE.match(line)
        if section_match:
            # Emit the pending entry before switching sections. Otherwise the
            # last device of a section is recorded under the next section's
            # name, which also skews market-brand and device-type inference.
            flush_current()
            section = section_match.group(1).strip()
            continue
        entry_match = ENTRY_RE.match(line)
        if entry_match:
            flush_current()
            raw_title = entry_match.group(1).strip()
            current_title = clean_entry_title(raw_title)
            current_aliases = set(extract_codes(raw_title))
            current_aliases.add(current_title)
            continue
        if not current_title:
            continue
        variant_match = VARIANT_RE.match(line)
        if variant_match:
            variant_codes = extract_codes(variant_match.group(1))
            variant_name = variant_match.group(2).strip()
            current_aliases.update(variant_codes)
            current_aliases.add(variant_name)
    flush_current()
    return records
class DeviceMapper:
    """In-memory index that resolves a device name, optionally narrowed by brand.

    Attributes:
        records: The records the index was built from.
        records_by_id: Record id -> record.
        alias_index: Normalized alias -> ids of the records that carry it.
        manufacturer_alias_lookup: Normalized alias -> manufacturer brand.
        parent_alias_lookup: Normalized alias -> parent brand.
        market_alias_lookup: Normalized alias -> market brand.
        parent_to_children: Parent brand -> manufacturer brands found in the data.
    """

    def __init__(self, records: List[DeviceRecord]) -> None:
        """Build every lookup table from *records*."""
        self.records = records
        self.records_by_id = {record.id: record for record in records}
        self.manufacturer_alias_lookup: Dict[str, str] = {}
        self.parent_alias_lookup: Dict[str, str] = {}
        self.market_alias_lookup: Dict[str, str] = {}
        self.parent_to_children: Dict[str, Set[str]] = {}
        self.alias_index: Dict[str, Set[str]] = {}

        for record in records:
            for alias in record.aliases:
                key = normalize_text(alias)
                if key:
                    self.alias_index.setdefault(key, set()).add(record.id)

        def register(lookup: Dict[str, str], brands: List[str]) -> None:
            # Map every normalized alias of each brand to that brand; when two
            # brands share an alias the later brand wins.
            for brand in brands:
                for alias in brand_aliases(brand):
                    key = normalize_text(alias)
                    if key:
                        lookup[key] = brand

        manufacturers = sorted({record.manufacturer_brand for record in records})
        parents = sorted({record.parent_brand for record in records})
        register(self.manufacturer_alias_lookup, manufacturers)
        register(self.parent_alias_lookup, parents)

        for manufacturer in manufacturers:
            parent = resolve_parent_brand(manufacturer)
            self.parent_to_children.setdefault(parent, set()).add(manufacturer)

        for market_brand, aliases in MARKET_BRAND_ALIASES.items():
            for alias in set([market_brand, *aliases]):
                key = normalize_text(alias)
                if key:
                    self.market_alias_lookup[key] = market_brand

    @staticmethod
    def _make_filter(
        parent: Optional[str],
        manufacturer: Optional[str],
        market: Optional[str],
        source: str,
    ) -> Dict[str, Optional[str]]:
        """Assemble a brand-filter dict with its keys in the canonical order."""
        return {
            "parent_brand": parent,
            "manufacturer_brand": manufacturer,
            "market_brand": market,
            "source": source,
        }

    def _manufacturer_filter(
        self,
        manufacturer_brand: str,
        parent_source: str,
        manufacturer_source: str,
    ) -> Dict[str, Optional[str]]:
        """Build the filter for an input that resolved to a manufacturer brand.

        A brand that is its own parent and has more than one child in the data
        is treated as a parent, so the filter covers all of its children.
        """
        parent_brand = resolve_parent_brand(manufacturer_brand)
        children = self.parent_to_children.get(manufacturer_brand, set())
        if manufacturer_brand == parent_brand and len(children) > 1:
            return self._make_filter(parent_brand, None, None, parent_source)
        return self._make_filter(parent_brand, manufacturer_brand, None, manufacturer_source)

    def _parse_brand_filter(self, input_brand: Optional[str]) -> Dict[str, Optional[str]]:
        """Interpret a raw brand string as a parent/manufacturer/market filter.

        Exact alias matches are tried first (market, then manufacturer, then
        parent), followed by "alias is contained in the input" matches in the
        same order. The "source" entry names the rule that matched, or "none".
        """
        input_norm = normalize_text(input_brand) if input_brand else ""
        if not input_norm:
            return self._make_filter(None, None, None, "none")

        # --- exact matches ---
        market_brand = self.market_alias_lookup.get(input_norm)
        if market_brand is not None:
            manufacturer_brand = MARKET_BRAND_TO_MANUFACTURER.get(market_brand, market_brand)
            parent_brand = resolve_parent_brand(manufacturer_brand)
            if market_brand == "Xiaomi":
                # "Xiaomi" is handled as the manufacturer: no market restriction.
                return self._make_filter(
                    parent_brand, manufacturer_brand, None, "manufacturer_alias_from_market"
                )
            return self._make_filter(
                parent_brand, manufacturer_brand, market_brand, "market_alias_exact"
            )

        manufacturer_brand = self.manufacturer_alias_lookup.get(input_norm)
        if manufacturer_brand is not None:
            return self._manufacturer_filter(
                manufacturer_brand, "parent_alias_exact", "manufacturer_alias_exact"
            )

        parent_brand = self.parent_alias_lookup.get(input_norm)
        if parent_brand is not None:
            return self._make_filter(parent_brand, None, None, "parent_alias_exact")

        # --- containment matches, first hit in table order wins ---
        for alias_norm, market_brand in self.market_alias_lookup.items():
            if alias_norm and alias_norm in input_norm:
                manufacturer_brand = MARKET_BRAND_TO_MANUFACTURER.get(market_brand, market_brand)
                return self._make_filter(
                    resolve_parent_brand(manufacturer_brand),
                    manufacturer_brand,
                    market_brand,
                    "market_alias_contains",
                )

        for alias_norm, manufacturer_brand in self.manufacturer_alias_lookup.items():
            if alias_norm and alias_norm in input_norm:
                return self._manufacturer_filter(
                    manufacturer_brand, "parent_alias_contains", "manufacturer_alias_contains"
                )

        for alias_norm, parent_brand in self.parent_alias_lookup.items():
            if alias_norm and alias_norm in input_norm:
                return self._make_filter(parent_brand, None, None, "parent_alias_contains")

        return self._make_filter(None, None, None, "none")

    @staticmethod
    def _brand_match(
        brand_filter: Dict[str, Optional[str]],
        record: DeviceRecord,
    ) -> bool:
        """Return True when *record* satisfies every non-empty filter entry."""
        parent = brand_filter.get("parent_brand")
        manufacturer = brand_filter.get("manufacturer_brand")
        market = brand_filter.get("market_brand")
        mismatch = (
            (parent and record.parent_brand != parent)
            or (manufacturer and record.manufacturer_brand != manufacturer)
            or (market and record.market_brand != market)
        )
        return not mismatch

    def find(self, name: str, brand: Optional[str] = None, limit: int = 5) -> Dict[str, object]:
        """Look up a device by exact (normalized) alias, optionally filtered by brand.

        Args:
            name: Raw device name or code.
            brand: Optional raw brand string used to narrow the matches.
            limit: Maximum number of candidates returned; also bounds the
                suggestions listed when nothing matches.

        Returns:
            A dict with "matched" plus either "best"/"candidates" or
            "reason"/"suggestions".
        """
        query = normalize_text(name)
        if not query:
            return {
                "matched": False,
                "reason": "Empty device name.",
                "query_name": name,
                "query_brand": brand,
                "candidates": [],
            }

        exact_hits = [self.records_by_id[rid] for rid in self.alias_index.get(query, set())]
        brand_filter = self._parse_brand_filter(brand)
        matched_records = exact_hits
        if brand:
            matched_records = [r for r in exact_hits if self._brand_match(brand_filter, r)]
            if not matched_records and brand_filter.get("manufacturer_brand"):
                # Nothing under the requested market brand: retry with the
                # manufacturer alone.
                fallback_filter = {
                    "parent_brand": brand_filter.get("parent_brand"),
                    "manufacturer_brand": brand_filter.get("manufacturer_brand"),
                    "market_brand": None,
                }
                matched_records = [r for r in exact_hits if self._brand_match(fallback_filter, r)]
        matched_records = sorted(
            matched_records, key=lambda r: (r.device_name, r.source_file, r.id)
        )

        if matched_records:
            return {
                "matched": True,
                "query_name": name,
                "query_brand": brand,
                "query_brand_parsed": brand_filter,
                "best": asdict(matched_records[0]),
                "candidates": [asdict(r) for r in matched_records[:limit]],
            }

        suggestions: List[str] = []
        for alias in self.alias_index:
            if query not in alias and alias not in query:
                continue
            suggestions.append(alias)
            if len(suggestions) >= limit:
                break
        return {
            "matched": False,
            "query_name": name,
            "query_brand": brand,
            "query_brand_parsed": brand_filter,
            "reason": "No exact alias match.",
            "candidates": [],
            "suggestions": suggestions,
        }
def build_records(repo_root: Path) -> List[DeviceRecord]:
    """Parse every "*.md" file under "<repo_root>/brands", in sorted path order."""
    brand_files = sorted((repo_root / "brands").glob("*.md"))
    return [record for md_path in brand_files for record in parse_brand_file(md_path)]
def export_index(records: List[DeviceRecord], output_path: Path) -> None:
    """Write the device index for *records* to *output_path* as UTF-8 JSON.

    The document holds the generation date, brand lists, alias tables,
    brand relationships, per-brand record counts, the normalized-alias ->
    record-id lookup and the records themselves. Missing parent directories
    of *output_path* are created.
    """
    manufacturer_brands_in_data = sorted({r.manufacturer_brand for r in records})
    parent_brands_in_data = sorted({r.parent_brand for r in records})
    market_brands_in_data = sorted({r.market_brand for r in records})
    all_brands_in_data = sorted(
        {*manufacturer_brands_in_data, *market_brands_in_data, *MARKET_BRAND_TO_MANUFACTURER}
    )

    # Market brands map through the explicit table; everything else maps to
    # its parent brand.
    brand_to_manufacturer = {
        brand: (
            MARKET_BRAND_TO_MANUFACTURER[brand]
            if brand in MARKET_BRAND_TO_MANUFACTURER
            else resolve_parent_brand(brand)
        )
        for brand in all_brands_in_data
    }

    grouped_children: Dict[str, List[str]] = {}
    for child, parent in brand_to_manufacturer.items():
        grouped_children.setdefault(parent, []).append(child)
    parent_to_children = {
        parent: sorted(children) for parent, children in grouped_children.items()
    }

    all_aliases = {brand: brand_aliases(brand) for brand in all_brands_in_data}

    ids_by_alias: Dict[str, Set[str]] = {}
    for record in records:
        for alias in record.aliases:
            key = normalize_text(alias)
            if key:
                ids_by_alias.setdefault(key, set()).add(record.id)
    lookup = {key: sorted(ids) for key, ids in ids_by_alias.items()}

    stats = {
        "manufacturer_brand": dict(sorted(Counter(r.manufacturer_brand for r in records).items())),
        "parent_brand": dict(sorted(Counter(r.parent_brand for r in records).items())),
        "market_brand": dict(sorted(Counter(r.market_brand for r in records).items())),
    }

    output = {
        "generated_on": date.today().isoformat(),
        "total_records": len(records),
        "brands": manufacturer_brands_in_data,
        "brand_aliases": all_aliases,
        "brand_management": {
            "brands": all_brands_in_data,
            "manufacturers": sorted(set(brand_to_manufacturer.values())),
            "manufacturer_aliases": all_aliases,
            "manufacturer_to_parent": brand_to_manufacturer,
            "brand_to_manufacturer": brand_to_manufacturer,
            "parent_to_children": parent_to_children,
            "parent_aliases": {brand: brand_aliases(brand) for brand in parent_brands_in_data},
            "market_brand_aliases": MARKET_BRAND_ALIASES,
            "market_brand_to_manufacturer": MARKET_BRAND_TO_MANUFACTURER,
            "market_brands": market_brands_in_data,
            "parent_brands": parent_brands_in_data,
            "stats": stats,
        },
        "lookup": lookup,
        "records": [asdict(r) for r in records],
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(output, ensure_ascii=False, indent=2)
    output_path.write_text(serialized, encoding="utf-8")
def main() -> None:
    """Command-line entry point.

    Sub-commands:
        build: parse the brand files and write the JSON index. A relative
            ``--output`` is resolved against PROJECT_ROOT.
        find: parse the brand files and print the lookup result for
            ``--name`` (optionally narrowed by ``--brand``) as JSON.
    """
    parser = argparse.ArgumentParser(description="MobileModels device mapper")
    parser.add_argument(
        "--repo-root",
        type=Path,
        default=WORKSPACE_ROOT,
        help="Path to workspace root",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)
    build_cmd = subparsers.add_parser("build", help="Build JSON index")
    build_cmd.add_argument(
        "--output",
        type=Path,
        default=Path("dist/device_index.json"),
        help="Output JSON path",
    )
    find_cmd = subparsers.add_parser("find", help="Find a device by name + optional brand")
    find_cmd.add_argument("--name", required=True, help="Raw device name from app")
    find_cmd.add_argument("--brand", default=None, help="Optional raw brand from app")
    find_cmd.add_argument("--limit", type=int, default=5, help="Max matched candidates")
    args = parser.parse_args()

    records = build_records(args.repo_root)

    if args.command == "build":
        output_path: Path = args.output
        if not output_path.is_absolute():
            output_path = PROJECT_ROOT / output_path
        export_index(records, output_path)
        print(f"Built index: {output_path}")
        print(f"Total records: {len(records)}")
        return

    if args.command == "find":
        # The lookup index is only needed for queries, so it is not built
        # for the "build" command.
        mapper = DeviceMapper(records)
        result = mapper.find(name=args.name, brand=args.brand, limit=args.limit)
        print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()