[General] CSV 及脚本移至新 repo
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1 +0,0 @@
|
|||||||
.idea
|
|
||||||
@@ -52,6 +52,9 @@
|
|||||||
|
|
||||||
## 项目历史
|
## 项目历史
|
||||||
|
|
||||||
|
### 2024 年 3 月
|
||||||
|
- 将本项目 csv 及脚本迁移至 [此 repo](https://github.com/KHwang9883/MobileModels-csv),使用 GitHub Actions 自动更新。
|
||||||
|
|
||||||
### 2022 年 4 月
|
### 2022 年 4 月
|
||||||
- 新增 [各大手机厂商 BL 解锁/内核开源情况](misc/bootloader-kernel-source.md) 汇总。
|
- 新增 [各大手机厂商 BL 解锁/内核开源情况](misc/bootloader-kernel-source.md) 汇总。
|
||||||
|
|
||||||
|
|||||||
@@ -1,298 +0,0 @@
|
|||||||
#!/usr/bin/python3
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
# File : mo_models.py
|
|
||||||
# Author: anyongjin
|
|
||||||
# Date : 2022/11/12
|
|
||||||
'''
|
|
||||||
将设备型号从MarkDown读取为CSV格式的脚本
|
|
||||||
输出列:设备编号,设备类型,品牌代码,品牌名,型号编码,型号昵称,型号名称,版本名称
|
|
||||||
'''
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import traceback
|
|
||||||
import pandas as pd
|
|
||||||
from typing import Optional, List
|
|
||||||
from os.path import dirname, abspath
|
|
||||||
|
|
||||||
# Directory scanned for brand markdown files: "<parent of this script's directory>/brands".
source_dir = os.path.join(dirname(dirname(abspath(__file__))), 'brands')

# Parsing context shared (and mutated) by the functions of this module.
device_type: Optional[str] = None  # device type: phone, TV, band, ...
root_brand: Optional[str] = None  # brand code
root_brand_title: Optional[str] = None  # brand name
devc_code: Optional[str] = None  # device model code
devc_code_alias: Optional[str] = None  # device model nickname
devc_model_names: List[str] = []  # official device model names

# Generic text patterns.
_re_title = re.compile(r'^#+')
_re_blanks = re.compile(r'\s+')
_re_char = re.compile(r'([+]+|[^\W_])')
_re_word = re.compile(r'([a-zA-Z0-9]+|[^\W_]{,3})')
_re_non_word = re.compile(r'[\W_]+')

# Patterns matching the models and the version on a "`MODEL` ...: version" line.
_re_model_ver = re.compile(r'^`(([^`]+)`\s*)+:\s*')
_re_model_item = re.compile(r'`([^`]+)`')

# Pattern matching a device-type keyword.
_re_device_type = re.compile(r'(手机|手表|手环|平板|电视主机|盒子|(智能)?电视|笔记本电脑|设备|Mobile|Phone|Pad|Pod|Tablet|Watch|Band|WATCH|Device|\bTV\b|学习智慧屏|智慧屏)')

# Keyword -> device type written to the CSV. Lookups use the lower-cased keyword;
# keywords without an entry are used as-is (see _read_device_type).
_device_map = {
    '手机': 'mob',
    'mobile': 'mob',
    'phone': 'mob',
    '电视': 'tv',
    '智能电视': 'tv',
    '学习智慧屏': 'pad',
    '智慧屏': 'tv',
    '设备': 'device',
    '手表': 'watch',
    '手环': 'band',
    'Band': 'band',
    '笔记本电脑': 'computer',
    'tablet': 'pad',
    '平板': 'pad',
    '电视主机': 'tv_hub',
    '盒子': 'tv_hub',
}

# Output columns and the rows accumulated while parsing.
pd_cols = [
    'model',
    'dtype',
    'brand',
    'brand_title',
    'code',
    'code_alias',
    'model_name',
    'ver_name',
]
pd_rows = []
|
|
||||||
|
|
||||||
|
|
||||||
def _process_h1(line: str):
    '''
    Set the device type and the brand title from a level-1 heading.

    The brand is looked up in the text before the device-type keyword. With a
    single candidate word that word becomes the title; otherwise the first
    candidate that differs (case-insensitively) from the brand code is used,
    falling back to the brand code itself.

    :param line: heading text without the leading '#'
    :return: None; updates the globals device_type and root_brand_title
    :raises ValueError: if no device-type keyword or no brand word is found
    '''
    global device_type, root_brand, root_brand_title
    assert root_brand, 'root_brand is required'
    # Remove descriptive words that carry no brand information.
    line = re.sub(r'(Global|早期|国行)', '', line)
    # The brand ends where the device-type keyword starts.
    type_at, device_type = _read_device_type(line)
    # Only words of at least two characters count as brand candidates.
    candidates = re.findall(r'\w{2,}', line[:type_at])
    if not candidates:
        raise ValueError(f'no brand found in h1: {line}')
    if len(candidates) == 1:
        root_brand_title = candidates[0]
    else:
        code_folded = root_brand.lower()
        root_brand_title = next(
            (word for word in candidates if word.lower() != code_folded),
            root_brand,
        )
|
|
||||||
|
|
||||||
|
|
||||||
def _read_device_type(line: str, raise_err: bool = True):
    '''
    Find the first device-type keyword in a line.

    :param line: text to search
    :param raise_err: raise when no keyword is present instead of returning (-1, None)
    :return: (start offset of the keyword, device type); the type is the
        lower-cased keyword mapped through _device_map, or the lower-cased
        keyword itself when it has no entry
    :raises ValueError: if nothing matches and raise_err is true
    '''
    hit = _re_device_type.search(line)
    if hit:
        keyword = hit.group().lower()
        return hit.start(), _device_map.get(keyword, keyword)
    if not raise_err:
        return -1, None
    raise ValueError(f'unknown h1 format: {line}')
|
|
||||||
|
|
||||||
|
|
||||||
def _process_bold_model(line: str):
    '''
    Handle a bold device-model line.

    Resets the code-level context, then extracts the model code from a leading
    [`...`], the nickname from a trailing (`...`), and treats the text between
    them as the model name(s).

    :param line: line content without the surrounding '**'
    :return: None; updates the globals device_type, devc_code,
        devc_code_alias and devc_model_names
    '''
    global device_type, devc_code, devc_code_alias, devc_model_names
    _reset_context('code')
    code_hit = re.search(r'\[\`([^`]+)\`\]', line)
    alias_hit = re.search(r'\(\`([^`]+)\`\)', line)
    name_from = 0
    name_to = len(line)
    if code_hit:
        devc_code = code_hit.group(1)
        name_from = code_hit.end()
    if alias_hit:
        devc_code_alias = alias_hit.group(1)
        name_to = alias_hit.start()
    model_name = _strip_text(line[name_from: name_to])
    # A device-type keyword inside the model name overrides the current type.
    detected = _read_device_type(model_name, False)[1]
    if detected:
        device_type = detected
    # One line may carry several names separated by '/'.
    devc_model_names = []
    for candidate in _try_split_by_splash(model_name):
        candidate = _strip_text(candidate)
        brand_at = candidate.find(root_brand)
        if brand_at >= 0:
            # The name contains the brand code: keep only what follows it.
            candidate = _strip_text(candidate[brand_at + len(root_brand):])
        type_hit = _re_device_type.search(candidate)
        if type_hit:
            # Keep only the text after a device-type keyword as well.
            candidate = _strip_text(candidate[type_hit.end():])
        devc_model_names.append(candidate)
|
|
||||||
|
|
||||||
|
|
||||||
def _get_ver_name_with_model(ver_full: str, model_name: str):
    '''
    Remove the model-name part from a full version name.

    The version text may repeat the model name only partially. Characters
    (as matched by _re_char) are compared case-insensitively, starting where
    the first word of the model name occurs in the version text.

    :param ver_full: full version text
    :param model_name: model name to strip from it
    :return: ver_full unchanged when the first word of model_name does not
        occur in it; otherwise '#' followed by the cleaned text starting at
        the first character that no longer follows the model name ('#' alone
        when nothing is left)
    '''
    first_word = _re_word.search(model_name).group().lower()
    offset = ver_full.lower().find(first_word)
    if offset < 0:
        return ver_full
    expected = [m.group().lower() for m in _re_char.finditer(model_name)]
    matched = 0
    for token in _re_char.finditer(ver_full):
        pos = token.start()
        if pos < offset:
            # Still before the place where the model name begins.
            continue
        if matched < len(expected) and token.group().lower() == expected[matched]:
            matched += 1
            continue
        # Model name exhausted or mismatch: the rest is the version part.
        return '#' + _strip_text(ver_full[pos:])
    return '#'
|
|
||||||
|
|
||||||
|
|
||||||
def _strip_text(text: str):
    '''
    Trim leading/trailing characters not matched by _re_char and re-balance brackets.

    Trimming can cut off a bracket at either end, so missing ASCII or
    full-width round brackets are added back: an opener is prepended for every
    closer without a preceding opener, and a closer is appended for every
    opener left unclosed. Added brackets follow nesting order, so mixed
    ASCII/full-width brackets stay properly nested.

    :param text: raw text
    :return: cleaned text, or '' when text has no character matched by _re_char
    '''
    # Drop invalid leading characters.
    start = _re_char.search(text)
    if not start:
        return ''
    text = text[start.start():]
    # Drop invalid trailing characters (search the reversed string).
    end_pos = len(text) - _re_char.search(text[::-1]).start()
    clean_text = text[:end_pos]
    # Complete the missing brackets.
    brac_map = {'(': ')', '(': ')', ')': '(', ')': '('}
    open_stack, prepend = [], []
    for c in clean_text:
        if c in {'(', '('}:
            open_stack.append(c)
        elif c in {')', ')'}:
            if open_stack:
                open_stack.pop()
            else:
                prepend.append(brac_map[c])
    # The innermost unclosed opener must be closed first, and the first
    # unmatched closer belongs to the innermost prepended opener.
    appends = [brac_map[brac] for brac in reversed(open_stack)]
    return ''.join([*reversed(prepend), clean_text, *appends])
|
|
||||||
|
|
||||||
|
|
||||||
def _get_ver_name(ver_full: str):
    '''
    Compute the version name relative to the current model names.

    Every name in devc_model_names is stripped from ver_full and the shortest
    result wins (the first one on ties). When the winner is not the first
    model name, its index is prefixed to the result.

    :param ver_full: full version text
    :return: the version name
    :raises ValueError: if no version name could be derived, including the
        case where devc_model_names is empty
    '''
    ver_names, last_err = [], None
    for i, mname in enumerate(devc_model_names):
        try:
            ver_names.append((i, _get_ver_name_with_model(ver_full, mname)))
        except ValueError as e:
            last_err = e
    if not ver_names:
        if last_err is None:
            # No model names in context: `raise None` would only produce an
            # unrelated TypeError, so report the real problem instead.
            raise ValueError(f'no model name in context for version: {ver_full}')
        raise last_err
    # min() returns the first minimal item, like a stable sort followed by [0].
    idx, best = min(ver_names, key=lambda item: len(item[1]))
    return best if not idx else f'{idx}{best}'
|
|
||||||
|
|
||||||
|
|
||||||
def _try_split_by_splash(type_name: str):
    '''
    Split a name on '/' when the parts look like variants of one product.

    Variants usually share their leading word, so the split is accepted only
    when the first word of the first two parts is identical.

    :param type_name: text that may contain several '/'-separated names
    :return: the stripped parts, or [type_name] when the first two parts
        start with different words
    '''
    parts = [piece.strip() for piece in type_name.split('/')]
    if len(parts) < 2:
        return parts
    lead_a = _re_non_word.split(parts[0])[0]
    lead_b = _re_non_word.split(parts[1])[0]
    # Different leading words: not considered multiple versions.
    return parts if lead_a == lead_b else [type_name]
|
|
||||||
|
|
||||||
|
|
||||||
def _process_model_ver(line: str, mat: re.Match):
    '''
    Turn a "`MODEL` ...: version" line into output rows.

    One row is appended to pd_rows for every combination of version name
    (the text after the match, possibly split on '/') and model code found
    inside the match.

    :param line: the stripped line
    :param mat: match of _re_model_ver on that line
    :return: None
    '''
    model_codes = [hit.group(1) for hit in _re_model_item.finditer(mat.group())]
    remainder = _strip_text(line[mat.end():])
    for full_name in _try_split_by_splash(remainder):
        ver_name = _get_ver_name(full_name)
        pd_rows.extend(
            (code, device_type, root_brand, root_brand_title, devc_code, devc_code_alias,
             '|'.join(devc_model_names), ver_name)
            for code in model_codes
        )
|
|
||||||
|
|
||||||
|
|
||||||
def _process_line(line: str):
    '''
    Dispatch one non-empty, stripped markdown line to its handler.

    Lines starting with '-' are ignored. A level-1 heading goes to
    _process_h1, a level-2 heading may only update the device type, a line
    wrapped in '**' is a bold model line, and a line matching _re_model_ver is
    a model/version line.

    :param line: the stripped line
    :return: None
    :raises ValueError: for headings deeper than level 2 and for lines of an
        unknown shape
    '''
    global device_type
    if line.startswith('-'):
        return
    hashes = _re_title.search(line)
    title_level = 0 if hashes is None else len(hashes.group(0))
    body = line[title_level:].strip()

    if title_level == 1:
        _process_h1(body)
        return
    if title_level == 2:
        # Series, sub-brand or a different product type.
        section_type = _read_device_type(body, False)[1]
        if section_type:
            device_type = section_type
        return
    if title_level:
        raise ValueError(f'unknown title type: {title_level}, {line}')

    if body.startswith('**') and body.endswith('**'):
        _process_bold_model(body[2: -2])
        return
    version_hit = _re_model_ver.search(body)
    if not version_hit:
        raise ValueError(f'unknown line: {line}')
    _process_model_ver(body, version_hit)
|
|
||||||
|
|
||||||
|
|
||||||
def _reset_context(level: str):
    '''
    Reset part of the parsing context.

    :param level: 'brand' clears device_type and root_brand_title; 'code'
        clears devc_code, devc_code_alias and devc_model_names; 'all' clears
        both groups; any other value changes nothing
    :return: None
    '''
    global device_type, root_brand_title, devc_code, devc_code_alias, devc_model_names
    wipe_everything = level == 'all'
    if wipe_everything or level == 'brand':
        device_type = root_brand_title = None
    if wipe_everything or level == 'code':
        devc_code = devc_code_alias = None
        devc_model_names = []
|
|
||||||
|
|
||||||
|
|
||||||
def sync_brands(name: str):
    '''
    Parse one brand file from source_dir line by line.

    The brand code is the first word of the file name with 'shouji' removed.
    A line that fails to parse is reported on stdout together with its
    traceback, and parsing continues with the next line.

    :param name: file name inside source_dir
    :return: None
    '''
    global root_brand
    _reset_context('all')
    root_brand = re.split(r'[\W_]+', name)[0].replace('shouji', '')
    with open(os.path.join(source_dir, name), 'r', encoding='utf-8') as fdata:
        for raw in fdata:
            try:
                stripped = raw.strip()
                if stripped:
                    _process_line(stripped)
            except Exception as e:
                print(f'exception process {root_brand}: {e}')
                traceback.print_exc()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # os.listdir() returns entries in arbitrary order; sort them so the row
    # order of the generated CSV is the same on every run and platform.
    fnames = sorted(os.listdir(source_dir))
    # Every entry is processed, including '*_en.md' files.
    for name in fnames:
        print(f'process: {name}')
        sync_brands(name)
    # Dump all collected rows into a CSV in the current working directory.
    df = pd.DataFrame(pd_rows, columns=pd_cols)
    df.to_csv('./models.csv', index=False)
    print('generate complete, out file: ./models.csv')
|
|
||||||
7405
scripts/models.csv
7405
scripts/models.csv
File diff suppressed because it is too large
Load Diff
@@ -1,28 +0,0 @@
|
|||||||
# 说明
|
|
||||||
这是用于将markdown格式的设备型号介绍转为csv格式的python脚本。
|
|
||||||
输出列:设备编号,设备类型,品牌代码,品牌名,型号编码,型号昵称,型号名称,版本名称
|
|
||||||
|
|
||||||
**设备编号(model)**
|
|
||||||
能从浏览器UserAgent中获取到的设备编号,如华为P40对应"ANA-AL00"
|
|
||||||
一个model可能对应多个版本,也可能多个model对应一个版本。
|
|
||||||
**设备类型(device_type)**
|
|
||||||
包含:手机、手表、平板、电视、电视盒子、笔记本、pod等
|
|
||||||
对应csv的值:mob,watch,band,pad,tv,tv_hub,computer,pod,device
|
|
||||||
会从一级标题、二级标题、加粗行中尝试提取。如果一级标题不存在则置为空,如果有多个二级标题,后面的二级标题(或加粗行)未检测到有效设备类型,会使用前面的设备类型。
|
|
||||||
**品牌代码(brand)**
|
|
||||||
从brands目录下的文件名中提取第一个单词
|
|
||||||
**品牌名(brand_title)**
|
|
||||||
从一级标题中按正则提取
|
|
||||||
**型号编码(code)**
|
|
||||||
从加粗行的前面中括号中提取
|
|
||||||
**型号昵称(code_alias)**
|
|
||||||
从加粗行的尾部小括号中提取
|
|
||||||
**型号名称(model_name)**
|
|
||||||
从加粗行去掉code和code_alias后剩余的内容
|
|
||||||
注意:一行可能有多个型号名称,以"/"分割
|
|
||||||
**版本名称(ver_name)**
|
|
||||||
从model行,提取冒号之后的内容,再去掉model_name的重合部分,只保留版本信息
|
|
||||||
有些版本名称可能没有完全包含model_name,而是只包含其中一部分,还有些可能完全没包含model_name
|
|
||||||
输出ver_name中,如果包含model_name,然后去掉了一部分,则规定以"#"开头。
|
|
||||||
如果有多个model_name,且包含的不是第1个,则"#"前面会添加索引
|
|
||||||
例如"CPH2413"对应"一加 10T 印度版",但所属的型号是"一加 Ace Pro / 一加 10T",有两个,和第二个型号相同,所以版本中去掉相同的"一加 10T"部分,变成"#印度版",又因为对应的型号是第二个,索引是1,故最后是"1#印度版"
|
|
||||||
Reference in New Issue
Block a user