在优化博客性能时,字体往往是一个被忽略却体积巨大的资源。以思源宋体为例,完整字体动辄十几 MB,即使分包后依然不小。对于以文字为主的博客来说,这是完全可以进一步优化的。
结果:
| 文件名 | 类型 | 大小 |
|---|---|---|
| core-misc.woff2 | woff2 | 39.68 kB |
| core-cjk.woff2 | woff2 | 141.33 kB |
| fallback.woff2 | woff2 | 96.26 kB |
思路:
- 从网站备份中提取高频字符(或者使用 3500 常用字<sup>1</sup>)
- 基于提取结果裁剪字体,生成子集
> [!NOTE]
> 不套用“3500 常用字”表,而是统计文章中真正用过的字符;这样生成的字体子集既能保证文章正常显示,又能最大程度缩小体积。
1. 从备份中提取常用字
通过脚本解析网站备份文件:
- 统计字符频率
- 提取前 800 个高频字
- 同时收集全部出现过的 CJK 字符
生成两个字符集文件: charset-core.txt、charset-all-cjk.txt
2. 生成字体子集
利用 fonttools 对原始思源宋体进行裁剪输出:
- core-cjk.woff2(高频汉字)
- core-misc.woff2(标点、符号、ASCII)
- fallback.woff2(其余 CJK 字符)
- subset-font.css(自动生成的 font-face 配置)
3. 脚本参考
提取
# extract-common-chars.ps1 - script parameters and runtime settings.
#   -InputFile : file whose text is analysed (mandatory).
#   -TopChars  : how many of the most frequent CJK characters go into the core charset (default 4000).
#   -OutputDir : directory for the generated files; when blank, the script falls back to a
#                path relative to the script's own location.
param(
[Parameter(Mandatory = $true)]
[string]$InputFile,
[int]$TopChars = 4000,
[string]$OutputDir = ""
)
# Strict mode: reading an unset variable or a missing property becomes an error.
Set-StrictMode -Version Latest
# Treat every error as terminating so a failed step aborts the run.
$ErrorActionPreference = "Stop"
function Get-TextByBestEncoding {
    <#
    .SYNOPSIS
    Decodes a byte buffer as UTF-8 and as GBK (code page 936) and returns the more plausible text.
    .DESCRIPTION
    Each decoding is scored as
        commonHits * 1000 + cjkCount - mojibakeHits * 200 - replacementChars * 10
    where commonHits counts occurrences of a fixed list of frequent Han characters,
    cjkCount counts characters in U+3400-4DBF / U+4E00-9FFF / U+F900-FAFF,
    mojibakeHits counts characters that typically show up when UTF-8 bytes are read
    as GBK, and replacementChars counts U+FFFD.
    GBK is returned only when its score is strictly higher; otherwise UTF-8 wins.
    .PARAMETER Bytes
    Raw content to decode.
    .OUTPUTS
    Hashtable with the keys Text (decoded string), Encoding ("UTF-8" or "GBK(936)")
    and CjkCount (number of CJK characters in Text).
    #>
    param(
        [byte[]]$Bytes
    )

    $cjkPattern = "[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]"
    $commonHan = "的是了我不在人有这中大来上个们到说和你地出道时年得就那要下以会可也后能子里所然文于着起看学"
    # Typical UTF-8 mojibake markers when decoded as GBK.
    $mojibakeMarkers = @("锛", "涓", "鐨", "銆", "鍙", "鎴", "浠", "璇", "鏄", "鍚")

    # Every needle is a distinct single character, so one alternation per category
    # gives the same total as scanning the text once per needle, in a single pass.
    $commonPattern = @($commonHan.ToCharArray() | ForEach-Object { [regex]::Escape([string]$_) }) -join "|"
    $mojibakePattern = @($mojibakeMarkers | ForEach-Object { [regex]::Escape($_) }) -join "|"
    # U+FFFD is built from its code point so the pattern does not depend on how the
    # script file itself was saved or re-encoded.
    $replacementPattern = [regex]::Escape([string][char]0xFFFD)

    $candidates = @(
        @{ Encoding = "UTF-8"; Text = [System.Text.Encoding]::UTF8.GetString($Bytes) },
        @{ Encoding = "GBK(936)"; Text = [System.Text.Encoding]::GetEncoding(936).GetString($Bytes) }
    )

    foreach ($candidate in $candidates) {
        $decodedText = $candidate["Text"]
        $cjkCount = [regex]::Matches($decodedText, $cjkPattern).Count
        $commonHits = [regex]::Matches($decodedText, $commonPattern).Count
        $mojibakeHits = [regex]::Matches($decodedText, $mojibakePattern).Count
        $replacementHits = [regex]::Matches($decodedText, $replacementPattern).Count

        $candidate["CjkCount"] = $cjkCount
        $candidate["Score"] = ($commonHits * 1000) + $cjkCount - ($mojibakeHits * 200) - ($replacementHits * 10)
    }

    # UTF-8 is the default; GBK must beat it outright.
    $best = $candidates[0]
    if ($candidates[1]["Score"] -gt $candidates[0]["Score"]) {
        $best = $candidates[1]
    }

    return @{
        Text = $best["Text"]
        Encoding = $best["Encoding"]
        CjkCount = $best["CjkCount"]
    }
}
function Add-CharsToSet {
    # Walks $Text one UTF-16 code unit at a time; every unit not yet present in $Set
    # is recorded in $Set and appended to $Ordered, so $Ordered keeps first-seen order.
    param(
        [System.Collections.Generic.HashSet[string]]$Set,
        [System.Collections.Generic.List[string]]$Ordered,
        [string]$Text
    )

    for ($index = 0; $index -lt $Text.Length; $index++) {
        $unit = $Text.Substring($index, 1)
        if (-not $Set.Add($unit)) {
            # Already known: nothing to append.
            continue
        }
        $Ordered.Add($unit)
    }
}
# Fail early when the input file is missing, then work with its resolved path.
$inputExists = Test-Path -LiteralPath $InputFile
if (-not $inputExists) {
    throw "Input file not found: $InputFile"
}
$inputFullPath = (Resolve-Path -LiteralPath $InputFile).Path

# A blank -OutputDir falls back to a folder relative to this script's location.
$scriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path
$useDefaultOutputDir = [string]::IsNullOrWhiteSpace($OutputDir)
if ($useDefaultOutputDir) {
    $OutputDir = Join-Path $scriptDir "..\assets\fonts\SourceHanSerifCN\subset"
}
$OutputDir = [System.IO.Path]::GetFullPath($OutputDir)
# -Force keeps this quiet when the directory already exists.
$null = New-Item -Path $OutputDir -ItemType Directory -Force

# Read the raw bytes and let the encoding heuristic decide how to decode them.
$bytes = [System.IO.File]::ReadAllBytes($inputFullPath)
$decoded = Get-TextByBestEncoding -Bytes $bytes
$text = [string]$decoded.Text
# Count how often each CJK ideograph (U+3400-4DBF, U+4E00-9FFF, U+F900-FAFF)
# occurs in the decoded text.
$cjkPattern = "[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]"
# Deliberately not called $matches: that name is PowerShell's automatic $Matches variable.
$cjkMatches = [regex]::Matches($text, $cjkPattern)

# Ordinal key comparison gives every distinct code point its own counter. The @{}
# literal compares string keys case-insensitively with culture rules, which is not
# guaranteed to keep all code points apart.
$freq = New-Object System.Collections.Hashtable -ArgumentList ([System.StringComparer]::Ordinal)
foreach ($m in $cjkMatches) {
    $ch = $m.Value
    if ($freq.ContainsKey($ch)) {
        $freq[$ch] += 1
    } else {
        $freq[$ch] = 1
    }
}

# Most frequent first; ties are ordered by the character itself.
# @(...) guarantees an array even for zero or one entry, so .Count is always valid
# under Set-StrictMode.
$sorted = @($freq.GetEnumerator() | Sort-Object -Property @{ Expression = "Value"; Descending = $true }, @{ Expression = "Name"; Descending = $false })
$uniqueCjk = $sorted.Count

# Clamp -TopChars into the range 1..(number of distinct CJK characters).
if ($TopChars -lt 1) {
    $TopChars = 1
}
if ($TopChars -gt $uniqueCjk -and $uniqueCjk -gt 0) {
    $TopChars = $uniqueCjk
}

$top = @()
if ($uniqueCjk -gt 0) {
    $top = @($sorted | Select-Object -First $TopChars)
}
# Base ASCII set + commonly used Chinese punctuation.
# Printable ASCII U+0020..U+007E is generated from code points, so no character is
# left out. The backtick (U+0060) in particular is awkward to type inside a
# double-quoted PowerShell string because it is the escape character.
$ascii = -join (0x20..0x7E | ForEach-Object { [char]$_ })

# CJK / general punctuation that should always be part of the core subset.
$hanPunctCodePoints = @(
    0x3001, 0x3002, 0xFF0C, 0xFF1A, 0xFF1B, 0xFF01, 0xFF1F,
    0x300A, 0x300B, 0x300C, 0x300D, 0x300E, 0x300F,
    0x201C, 0x201D, 0x2018, 0x2019, 0xFF08, 0xFF09,
    0x3010, 0x3011, 0x2014, 0x2026, 0x00B7, 0x3008, 0x3009
)
$hanPunct = ""
foreach ($cp in $hanPunctCodePoints) {
    $hanPunct += [char]$cp
}

# $set tracks membership, $ordered keeps insertion order:
# ASCII first, then punctuation, then the top-frequency CJK characters.
$set = New-Object "System.Collections.Generic.HashSet[string]"
$ordered = New-Object "System.Collections.Generic.List[string]"
Add-CharsToSet -Set $set -Ordered $ordered -Text $ascii
Add-CharsToSet -Set $set -Ordered $ordered -Text $hanPunct
foreach ($item in $top) {
    $ch = [string]$item.Name
    if ($set.Add($ch)) {
        $ordered.Add($ch)
    }
}

# Write the core charset as one continuous string.
$charsetCore = ($ordered -join "")
$charsetCorePath = Join-Path $OutputDir "charset-core.txt"
[System.IO.File]::WriteAllText($charsetCorePath, $charsetCore, [System.Text.Encoding]::UTF8)
# Every CJK character that occurred, concatenated in frequency order.
$charsetAllPath = Join-Path $OutputDir "charset-all-cjk.txt"
$allCjkChars = ""
if ($uniqueCjk -gt 0) {
    $allNames = foreach ($entry in $sorted) { $entry.Name }
    $allCjkChars = $allNames -join ""
}
[System.IO.File]::WriteAllText($charsetAllPath, $allCjkChars, [System.Text.Encoding]::UTF8)

# Total number of CJK occurrences; used as the denominator for the ratios below.
$totalCjkCount = 0
foreach ($entry in $sorted) {
    $totalCjkCount = $totalCjkCount + [int]$entry.Value
}
# Frequency table: one row per distinct CJK character, most frequent first.
# Rows are collected in memory and written with a single Out-File call, because
# -Append inside the loop reopens the file once per character.
$csvPath = Join-Path $OutputDir "char-frequency.csv"
$csvLines = New-Object "System.Collections.Generic.List[string]"
$csvLines.Add("rank,char,count,ratio,cumulative_ratio")
$acc = 0
$rank = 0
foreach ($item in $sorted) {
    $rank += 1
    $count = [int]$item.Value
    # Running total of occurrences, used for the cumulative ratio.
    $acc += $count
    $ratio = if ($totalCjkCount -gt 0) { [math]::Round(($count / $totalCjkCount), 8) } else { 0 }
    $cumRatio = if ($totalCjkCount -gt 0) { [math]::Round(($acc / $totalCjkCount), 8) } else { 0 }
    $csvLines.Add("$rank,$($item.Name),$count,$ratio,$cumRatio")
}
$csvLines | Out-File -FilePath $csvPath -Encoding UTF8
# Share (in percent) of all CJK occurrences covered by the first $TopChars characters.
$topCoverage = 0
$canComputeCoverage = ($totalCjkCount -gt 0) -and ($TopChars -gt 0)
if ($canComputeCoverage) {
    $topEntries = $sorted | Select-Object -First $TopChars
    $topCount = ($topEntries | Measure-Object -Property Value -Sum).Sum
    $topCoverage = [math]::Round(($topCount / $totalCjkCount) * 100, 4)
}

# key=value report, one entry per line, in a fixed order.
$reportPath = Join-Path $OutputDir "charset-report.txt"
$reportFields = [ordered]@{
    InputFile = $inputFullPath
    DetectedEncoding = $decoded.Encoding
    FileBytes = $bytes.Length
    TotalCjkChars = $totalCjkCount
    UniqueCjkChars = $uniqueCjk
    TopChars = $TopChars
    TopCoveragePercent = $topCoverage
    OutputCharsetCore = $charsetCorePath
    OutputCharsetAll = $charsetAllPath
    OutputFrequencyCsv = $csvPath
}
$report = @(
    foreach ($field in $reportFields.GetEnumerator()) {
        "$($field.Key)=$($field.Value)"
    }
)
$report | Out-File -FilePath $reportPath -Encoding UTF8

# Console summary.
Write-Output ("OK")
Write-Output ("Detected encoding: {0}" -f $decoded.Encoding)
Write-Output ("Total CJK chars: {0}" -f $totalCjkCount)
Write-Output ("Unique CJK chars: {0}" -f $uniqueCjk)
Write-Output ("Top {0} coverage: {1}%" -f $TopChars, $topCoverage)
Write-Output ("charset-core: {0}" -f $charsetCorePath)
Write-Output ("report: {0}" -f $reportPath)
.\extract-common-chars.ps1 `
-InputFile "...dat" `
-TopChars 800 `
-OutputDir "...subset-backup-800"

生成
# build-font-subsets.ps1 - script parameters and runtime settings.
#   -SourceFont      : font file to subset (mandatory).
#   -CoreCharsetFile : text file with the core character set (mandatory).
#   -AllCharsetFile  : text file with every character that must be covered (mandatory).
#   -OutputDir       : target directory; when blank, the directory of the core charset file is used.
#   -PythonCommand   : executable used to run the fontTools modules (default "python").
#   -InstanceWeight  : when greater than 0, the source font is first instanced at this
#                      wght value and the subsets are cut from that instance (default 400).
param(
[Parameter(Mandatory = $true)]
[string]$SourceFont,
[Parameter(Mandatory = $true)]
[string]$CoreCharsetFile,
[Parameter(Mandatory = $true)]
[string]$AllCharsetFile,
[string]$OutputDir = "",
[string]$PythonCommand = "python",
[int]$InstanceWeight = 400
)
# Strict mode: reading an unset variable or a missing property becomes an error.
Set-StrictMode -Version Latest
# Treat every error as terminating so a failed step aborts the build.
$ErrorActionPreference = "Stop"
function Resolve-ExistingPath {
    # Returns the resolved form of $Path (taken literally, no wildcard expansion).
    # Throws "Path not found: <path>" when nothing exists at that location.
    param(
        [Parameter(Mandatory = $true)]
        [string]$Path
    )

    if (Test-Path -LiteralPath $Path) {
        $resolved = Resolve-Path -LiteralPath $Path
        return $resolved.Path
    }

    throw "Path not found: $Path"
}
function Is-CjkCodePoint {
    # True when $CodePoint lies in one of the three ranges this script treats as CJK:
    # U+3400-4DBF, U+4E00-9FFF or U+F900-FAFF (bounds inclusive).
    param(
        [int]$CodePoint
    )

    $cjkRanges = @(
        @(0x3400, 0x4DBF),
        @(0x4E00, 0x9FFF),
        @(0xF900, 0xFAFF)
    )

    foreach ($range in $cjkRanges) {
        if ($CodePoint -ge $range[0] -and $CodePoint -le $range[1]) {
            return $true
        }
    }
    return $false
}
function Get-UniqueChars {
    # Returns the distinct UTF-16 code units of $Text as a string array,
    # in order of first appearance.
    param(
        [string]$Text
    )

    $seen = New-Object "System.Collections.Generic.HashSet[string]"
    $firstSeenOrder = New-Object "System.Collections.Generic.List[string]"

    for ($index = 0; $index -lt $Text.Length; $index++) {
        $unit = [string]$Text[$index]
        if ($seen.Add($unit)) {
            $firstSeenOrder.Add($unit)
        }
    }

    # The unary comma keeps the array from being unrolled on return, so callers
    # receive an array even for zero or one element.
    return ,$firstSeenOrder.ToArray()
}
function Convert-CharsToUnicodeRange {
    <#
    .SYNOPSIS
    Turns a list of single-character strings into a CSS unicode-range value.
    .DESCRIPTION
    Code points are de-duplicated and sorted; consecutive code points are merged into
    "U+START-END", isolated ones are written as "U+CODE" (upper-case hex, no padding).
    Entries are joined with ",". Null or empty entries are skipped.
    .PARAMETER Chars
    Strings that each hold exactly one UTF-16 code unit.
    .OUTPUTS
    The unicode-range string, or "" when there is nothing to describe.
    #>
    param(
        [string[]]$Chars
    )

    if (-not $Chars -or $Chars.Count -eq 0) {
        return ""
    }

    $codeSet = New-Object "System.Collections.Generic.HashSet[int]"
    foreach ($ch in $Chars) {
        if ([string]::IsNullOrEmpty($ch)) {
            continue
        }
        [void]$codeSet.Add([int][char]$ch)
    }
    if ($codeSet.Count -eq 0) {
        return ""
    }

    # @(...) around the whole pipeline keeps $codes an array even when only one code
    # point is present, so .Count and indexing behave the same under Set-StrictMode.
    $codes = @($codeSet | Sort-Object)

    $ranges = New-Object "System.Collections.Generic.List[string]"
    $start = [int]$codes[0]
    $prev = $start

    # The loop runs one step past the last element so that the final run is closed
    # by the same code that closes every other run.
    for ($i = 1; $i -le $codes.Count; $i++) {
        $hasCurrent = $i -lt $codes.Count
        if ($hasCurrent) {
            $curr = [int]$codes[$i]
            if ($curr -eq ($prev + 1)) {
                # Still inside a run of consecutive code points.
                $prev = $curr
                continue
            }
        }

        if ($start -eq $prev) {
            $ranges.Add(("U+{0}" -f $start.ToString("X")))
        } else {
            $ranges.Add(("U+{0}-{1}" -f @($start.ToString("X"), $prev.ToString("X"))))
        }

        if ($hasCurrent) {
            $start = $curr
            $prev = $curr
        }
    }

    return ($ranges -join ",")
}
function Invoke-FontSubset {
    <#
    .SYNOPSIS
    Runs "fontTools.subset" through the given Python command for one output file.
    .PARAMETER PythonCommand
    Command used to start Python.
    .PARAMETER SourceFontPath
    Font file to subset.
    .PARAMETER OutputFilePath
    Passed to the subsetter as --output-file.
    .PARAMETER TextFilePath
    Passed to the subsetter as --text-file.
    .PARAMETER CommonArgs
    Extra subsetter options appended after the arguments above; may be omitted.
    .NOTES
    Throws when the command finishes with a non-zero exit code.
    #>
    param(
        [string]$PythonCommand,
        [string]$SourceFontPath,
        [string]$OutputFilePath,
        [string]$TextFilePath,
        [string[]]$CommonArgs
    )

    # Not named $args: that is PowerShell's automatic variable for unbound arguments.
    $subsetArgs = @(
        "-m",
        "fontTools.subset",
        $SourceFontPath,
        "--output-file=$OutputFilePath",
        "--text-file=$TextFilePath"
    )
    # Append only real options; adding $null to an array would create an extra element.
    if ($null -ne $CommonArgs -and $CommonArgs.Count -gt 0) {
        $subsetArgs += $CommonArgs
    }

    & $PythonCommand @subsetArgs
    if ($LASTEXITCODE -ne 0) {
        throw "Subset build failed ($OutputFilePath) with exit code $LASTEXITCODE"
    }
}
# All three inputs must exist; each is replaced by its resolved path.
$sourceFontPath = Resolve-ExistingPath -Path $SourceFont
$coreCharsetPath = Resolve-ExistingPath -Path $CoreCharsetFile
$allCharsetPath = Resolve-ExistingPath -Path $AllCharsetFile

# A blank -OutputDir means "next to the core charset file".
$outputDirIsBlank = [string]::IsNullOrWhiteSpace($OutputDir)
if ($outputDirIsBlank) {
    $OutputDir = Split-Path -Parent $coreCharsetPath
}
$outputDirFull = [System.IO.Path]::GetFullPath($OutputDir)
$null = New-Item -ItemType Directory -Path $outputDirFull -Force

# $utf8 is used for reading the charset files, $utf8NoBom for writing files without a BOM.
$utf8 = [System.Text.Encoding]::UTF8
$utf8NoBom = New-Object -TypeName System.Text.UTF8Encoding -ArgumentList $false

$coreText = [System.IO.File]::ReadAllText($coreCharsetPath, $utf8)
$allText = [System.IO.File]::ReadAllText($allCharsetPath, $utf8)
# Defaults when no instancing is requested: subset the source font directly and
# advertise the weight range "250 900" in the generated CSS.
$subsetSourceFontPath = $sourceFontPath
$fontWeightDescriptor = "250 900"
$instanceFontPath = ""

$instancingRequested = $InstanceWeight -gt 0
if ($instancingRequested) {
    # Pin the wght axis with fontTools.varLib.instancer and subset from the result.
    $instanceFileName = "source-wght-{0}.ttf" -f $InstanceWeight
    $instanceFontPath = Join-Path $outputDirFull $instanceFileName

    $instanceArgs = @("-m", "fontTools.varLib.instancer", $sourceFontPath)
    $instanceArgs += ("wght={0}" -f $InstanceWeight)
    $instanceArgs += @("--output", $instanceFontPath)

    & $PythonCommand @instanceArgs

    if ($LASTEXITCODE -ne 0) {
        throw "Font instancer failed with exit code $LASTEXITCODE"
    }
    $instanceExists = Test-Path -LiteralPath $instanceFontPath
    if (-not $instanceExists) {
        throw "Instanced font not found: $instanceFontPath"
    }

    $subsetSourceFontPath = $instanceFontPath
    $fontWeightDescriptor = [string]$InstanceWeight
}
# Output locations of the three derived charset files.
$coreCjkCharsetPath = Join-Path $outputDirFull "charset-core-cjk.txt"
$coreMiscCharsetPath = Join-Path $outputDirFull "charset-core-misc.txt"
$fallbackCharsetPath = Join-Path $outputDirFull "charset-fallback.txt"

# Split the distinct core characters into CJK ideographs and everything else.
$coreUniqueChars = Get-UniqueChars -Text $coreText
$coreCjkChars = New-Object "System.Collections.Generic.List[string]"
$coreMiscChars = New-Object "System.Collections.Generic.List[string]"
foreach ($unit in $coreUniqueChars) {
    if (-not [string]::IsNullOrEmpty($unit)) {
        if (Is-CjkCodePoint -CodePoint ([int][char]$unit)) {
            $coreCjkChars.Add($unit)
        } else {
            $coreMiscChars.Add($unit)
        }
    }
}
$coreCjkText = $coreCjkChars -join ""
$coreMiscText = $coreMiscChars -join ""
[System.IO.File]::WriteAllText($coreCjkCharsetPath, $coreCjkText, $utf8NoBom)
[System.IO.File]::WriteAllText($coreMiscCharsetPath, $coreMiscText, $utf8NoBom)

# Lookup set holding every character that occurs in the core charset.
$coreSet = New-Object "System.Collections.Generic.HashSet[string]"
foreach ($unit in $coreUniqueChars) {
    [void]$coreSet.Add($unit)
}

# Fallback = characters of the "all" charset that the core charset does not contain,
# kept in their original order.
$fallbackChars = New-Object "System.Collections.Generic.List[string]"
for ($index = 0; $index -lt $allText.Length; $index++) {
    $unit = $allText.Substring($index, 1)
    if ($coreSet.Contains($unit)) {
        continue
    }
    $fallbackChars.Add($unit)
}
$fallbackText = $fallbackChars -join ""
[System.IO.File]::WriteAllText($fallbackCharsetPath, $fallbackText, $utf8NoBom)
# Options passed to every fontTools.subset call.
$commonSubsetArgs = @(
    "--flavor=woff2", "--layout-features=*",
    "--name-IDs=*", "--name-legacy", "--name-languages=*",
    "--notdef-glyph", "--notdef-outline", "--recommended-glyphs",
    "--symbol-cmap", "--legacy-cmap",
    "--no-hinting"
)

$coreCjkWoff2Path = Join-Path $outputDirFull "core-cjk.woff2"
$coreMiscWoff2Path = Join-Path $outputDirFull "core-misc.woff2"
$fallbackWoff2Path = Join-Path $outputDirFull "fallback.woff2"

# One entry per subset; a subset is built only when its charset is non-empty.
$subsetJobs = @(
    @{ Enabled = ($coreCjkChars.Count -gt 0); Output = $coreCjkWoff2Path; Charset = $coreCjkCharsetPath },
    @{ Enabled = ($coreMiscChars.Count -gt 0); Output = $coreMiscWoff2Path; Charset = $coreMiscCharsetPath },
    @{ Enabled = ($fallbackText.Length -gt 0); Output = $fallbackWoff2Path; Charset = $fallbackCharsetPath }
)
foreach ($job in $subsetJobs) {
    if (-not $job.Enabled) {
        continue
    }
    Invoke-FontSubset `
        -PythonCommand $PythonCommand `
        -SourceFontPath $subsetSourceFontPath `
        -OutputFilePath $job.Output `
        -TextFilePath $job.Charset `
        -CommonArgs $commonSubsetArgs
}
# Build subset-font.css: one @font-face rule per subset that has characters.
# The two core subsets share one family name; the fallback gets its own.
$subsetCssPath = Join-Path $outputDirFull "subset-font.css"
[System.Collections.Generic.List[string]]$cssLines = @()
$fallbackUniqueChars = Get-UniqueChars -Text $fallbackText

# BlankAfter: the core rules are followed by an empty line, the fallback rule is not.
$fontFaces = @(
    @{ Emit = ($coreCjkChars.Count -gt 0); Family = "HJ Source Han Serif Core"; File = "core-cjk.woff2"; Chars = $coreCjkChars.ToArray(); BlankAfter = $true },
    @{ Emit = ($coreMiscChars.Count -gt 0); Family = "HJ Source Han Serif Core"; File = "core-misc.woff2"; Chars = $coreMiscChars.ToArray(); BlankAfter = $true },
    @{ Emit = ($fallbackUniqueChars.Count -gt 0); Family = "HJ Source Han Serif Fallback"; File = "fallback.woff2"; Chars = $fallbackUniqueChars; BlankAfter = $false }
)
foreach ($face in $fontFaces) {
    if (-not $face.Emit) {
        continue
    }
    $unicodeRange = Convert-CharsToUnicodeRange -Chars $face.Chars
    $cssLines.Add("@font-face {")
    $cssLines.Add(" font-family: `"$($face.Family)`";")
    $cssLines.Add(" src: url(`"./$($face.File)`") format(`"woff2`");")
    $cssLines.Add(" font-style: normal;")
    $cssLines.Add(" font-weight: $fontWeightDescriptor;")
    $cssLines.Add(" font-display: swap;")
    $cssLines.Add(" unicode-range: $unicodeRange;")
    $cssLines.Add("}")
    if ($face.BlankAfter) {
        $cssLines.Add("")
    }
}

$subsetCss = ($cssLines -join [Environment]::NewLine)
[System.IO.File]::WriteAllText($subsetCssPath, $subsetCss, $utf8NoBom)
# Size of each generated subset in bytes; 0 when the file does not exist.
$coreCjkBytes = if (Test-Path -LiteralPath $coreCjkWoff2Path) { (Get-Item -LiteralPath $coreCjkWoff2Path).Length } else { 0 }
$coreMiscBytes = if (Test-Path -LiteralPath $coreMiscWoff2Path) { (Get-Item -LiteralPath $coreMiscWoff2Path).Length } else { 0 }
$fallbackBytes = if (Test-Path -LiteralPath $fallbackWoff2Path) { (Get-Item -LiteralPath $fallbackWoff2Path).Length } else { 0 }

# key=value build report, one entry per line.
$reportPath = Join-Path $outputDirFull "subset-build-report.txt"
$reportLines = @(
    "SourceFont=$sourceFontPath",
    "SubsetSourceFont=$subsetSourceFontPath",
    "InstanceWeight=$InstanceWeight",
    "CoreCharset=$coreCharsetPath",
    "CoreCjkCharset=$coreCjkCharsetPath",
    "CoreMiscCharset=$coreMiscCharsetPath",
    "AllCharset=$allCharsetPath",
    "FallbackCharset=$fallbackCharsetPath",
    "CoreChars=$($coreText.Length)",
    "CoreCjkChars=$($coreCjkText.Length)",
    "CoreMiscChars=$($coreMiscText.Length)",
    "FallbackChars=$($fallbackText.Length)",
    "CoreCjkWoff2=$coreCjkWoff2Path",
    "CoreCjkWoff2Bytes=$coreCjkBytes",
    "CoreMiscWoff2=$coreMiscWoff2Path",
    "CoreMiscWoff2Bytes=$coreMiscBytes",
    "FallbackWoff2=$fallbackWoff2Path",
    "FallbackWoff2Bytes=$fallbackBytes",
    "TotalSubsetBytes=$($coreCjkBytes + $coreMiscBytes + $fallbackBytes)",
    "SubsetCss=$subsetCssPath"
)
[System.IO.File]::WriteAllLines($reportPath, $reportLines, $utf8NoBom)

# Remove the temporary instanced font, if one was created.
if (-not [string]::IsNullOrWhiteSpace($instanceFontPath) -and (Test-Path -LiteralPath $instanceFontPath)) {
    try {
        Remove-Item -LiteralPath $instanceFontPath -Force -ErrorAction Stop
    } catch {
        # Keep build successful even if temp instance cleanup fails, but tell the
        # user that a leftover file remains in the output directory.
        Write-Warning "Could not remove temporary instance font '$instanceFontPath': $($_.Exception.Message)"
    }
}

# Console summary.
Write-Output "OK"
Write-Output "Core CJK woff2: $coreCjkWoff2Path ($coreCjkBytes bytes)"
Write-Output "Core Misc woff2: $coreMiscWoff2Path ($coreMiscBytes bytes)"
Write-Output "Fallback woff2: $fallbackWoff2Path ($fallbackBytes bytes)"
Write-Output "Subset css: $subsetCssPath"
Write-Output "Report: $reportPath"
.\build-font-subsets.ps1 `
-SourceFont "SourceHanSerifCN.ttf" `
-CoreCharsetFile "...charset-core.txt" `
-AllCharsetFile "...charset-all-cjk.txt" `
-OutputDir "...subset-backup-800" `
-InstanceWeight 400

结果
subset-backup-800/
├── core-cjk.woff2
├── core-misc.woff2
├── fallback.woff2
├── subset-font.css
├── charset-core.txt
└── charset-all-cjk.txt

调用
<link rel="stylesheet" href="<?php $this->options->themeUrl('assets/fonts/SourceHanSerifCN/subset-backup-800/subset-font.css'); ?>">

body {
font-family: "HJ Source Han Serif Core", "HJ Source Han Serif Fallback", serif;
}