用Typecho备份文件构建博客专属字库

在优化博客性能时,字体往往是一个被忽略却体积巨大的资源。以思源宋体为例,完整字体动辄十几 MB,即使分包后依然不小。对于以文字为主的博客来说,这是完全可以进一步优化的。

结果:

| 文件名 | 类型 | 大小 |
| --- | --- | --- |
| core-misc.woff2 | woff2 | 39.68 kB |
| core-cjk.woff2 | woff2 | 141.33 kB |
| fallback.woff2 | woff2 | 96.26 kB |

思路:

  1. 从网站备份中提取高频字符(或者使用 3500 常用字¹)
  2. 基于提取结果裁剪字体,生成子集
> [!NOTE]
> 不直接套用“3500 常用字”,而是统计博客里真正用过的字符;这样生成的字体子集既能保证文章正常显示,又能最大程度缩小体积。

1. 从备份中提取常用字

通过脚本解析网站备份文件:

  1. 统计字符频率
  2. 提取前 800 个高频字
  3. 同时收集全部出现过的 CJK 字符

生成两个字符集文件: charset-core.txt 和 charset-all-cjk.txt

2. 生成字体子集

利用 fonttools 对原始思源宋体进行裁剪输出:

core-cjk.woff2(高频汉字)
core-misc.woff2(标点、符号、ASCII)
fallback.woff2(其余 CJK 字符)
subset-font.css(自动生成的 font-face 配置)

3. 脚本参考

提取

# Counts CJK characters in a text/backup file and writes charset, frequency
# and report files for the most frequent ones into an output directory.
param(
    # Path of the file to analyse (required; the script throws if it does not exist).
    [Parameter(Mandatory = $true)]
    [string]$InputFile,

    # Number of most frequent CJK characters to put into the core charset.
    # Later clamped to at least 1 and at most the number of distinct CJK characters found.
    [int]$TopChars = 4000,

    # Output directory. When empty, a default path relative to the script's own folder is used.
    [string]$OutputDir = ""
)

# Strict mode plus "Stop" makes undefined variables/properties and cmdlet errors terminate the script.
Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"

function Get-TextByBestEncoding {
    <#
    .SYNOPSIS
    Decodes raw bytes as both UTF-8 and GBK (code page 936) and returns the
    decoding that looks more like real Chinese text.

    .DESCRIPTION
    Each candidate decoding is scored as
        (common-character hits * 1000) + (CJK character count)
        - (mojibake marker hits * 200) - (U+FFFD replacement characters * 10)
    GBK is chosen only when its score is strictly higher; ties go to UTF-8.

    .PARAMETER Bytes
    Raw file content to decode.

    .OUTPUTS
    Hashtable with the keys Text (decoded string), Encoding ("UTF-8" or
    "GBK(936)") and CjkCount (CJK characters matched in the chosen text).
    #>
    param(
        [byte[]]$Bytes
    )

    $cjkPattern = "[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]"
    $commonHan = "的是了我不在人有这中大来上个们到说和你地出道时年得就那要下以会可也后能子里所然文于着起看学"

    # Typical UTF-8 mojibake markers when decoded as GBK.
    $mojibakeMarkers = @("锛", "涓", "鐨", "銆", "鍙", "鎴", "浠", "璇", "鏄", "鍚")

    # One pattern per metric, so each candidate text is scanned once per metric
    # instead of once per character/marker. $commonHan holds distinct single
    # characters, so a character class yields the same total as per-character counting.
    $commonPattern = "[" + [regex]::Escape($commonHan) + "]"
    $mojibakePattern = (($mojibakeMarkers | ForEach-Object { [regex]::Escape($_) }) -join "|")

    # Written as a regex escape rather than a literal U+FFFD so the pattern does
    # not depend on the encoding the script file itself was saved or pasted in.
    $replacementPattern = "\uFFFD"

    # Scores one decoded candidate; reads the patterns above from the enclosing scope.
    $measure = {
        param([string]$Candidate)

        $cjkCount = [regex]::Matches($Candidate, $cjkPattern).Count
        $commonHit = [regex]::Matches($Candidate, $commonPattern).Count
        $mojibake = [regex]::Matches($Candidate, $mojibakePattern).Count
        $replacement = [regex]::Matches($Candidate, $replacementPattern).Count

        return @{
            CjkCount = $cjkCount
            Score = ($commonHit * 1000) + $cjkCount - ($mojibake * 200) - ($replacement * 10)
        }
    }

    $utf8 = [System.Text.Encoding]::UTF8.GetString($Bytes)
    $gbk = [System.Text.Encoding]::GetEncoding(936).GetString($Bytes)

    $utf8Stats = & $measure $utf8
    $gbkStats = & $measure $gbk

    if ($gbkStats.Score -gt $utf8Stats.Score) {
        return @{
            Text = $gbk
            Encoding = "GBK(936)"
            CjkCount = $gbkStats.CjkCount
        }
    }

    return @{
        Text = $utf8
        Encoding = "UTF-8"
        CjkCount = $utf8Stats.CjkCount
    }
}

function Add-CharsToSet {
    <#
    .SYNOPSIS
    Adds every UTF-16 character of a string to a set, recording first-seen order.

    .PARAMETER Set
    Set used to detect characters that were already added.

    .PARAMETER Ordered
    List that receives each newly added character, in encounter order.

    .PARAMETER Text
    String whose characters are added one by one.
    #>
    param(
        [System.Collections.Generic.HashSet[string]]$Set,
        [System.Collections.Generic.List[string]]$Ordered,
        [string]$Text
    )

    for ($index = 0; $index -lt $Text.Length; $index++) {
        $unit = $Text.Substring($index, 1)

        # HashSet.Add returns $false for an element that is already present.
        if (-not $Set.Add($unit)) {
            continue
        }

        $Ordered.Add($unit)
    }
}

# --- Validate input and prepare the output directory ---
if (-not (Test-Path -LiteralPath $InputFile)) {
    throw "Input file not found: $InputFile"
}

$inputFullPath = (Resolve-Path -LiteralPath $InputFile).Path
$scriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path

if ([string]::IsNullOrWhiteSpace($OutputDir)) {
    $OutputDir = Join-Path $scriptDir "..\assets\fonts\SourceHanSerifCN\subset"
}

$OutputDir = [System.IO.Path]::GetFullPath($OutputDir)
New-Item -Path $OutputDir -ItemType Directory -Force | Out-Null

# --- Decode the file and count every CJK character ---
$bytes = [System.IO.File]::ReadAllBytes($inputFullPath)
$decoded = Get-TextByBestEncoding -Bytes $bytes
$text = [string]$decoded.Text

$cjkPattern = "[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]"

# Named $cjkMatches rather than $matches: $Matches is an automatic variable
# that the -match operator populates, and should not be assigned to.
$cjkMatches = [regex]::Matches($text, $cjkPattern)

$freq = @{}
foreach ($m in $cjkMatches) {
    $ch = $m.Value
    if ($freq.ContainsKey($ch)) {
        $freq[$ch] += 1
    } else {
        $freq[$ch] = 1
    }
}

# Most frequent first; ties are ordered by the character itself.
# @() guarantees an array: without it the pipeline yields $null for no entries
# and a bare entry for exactly one, and reading .Count on those can fail
# under Set-StrictMode -Version Latest.
$sorted = @(
    $freq.GetEnumerator() |
        Sort-Object -Property @{ Expression = "Value"; Descending = $true }, @{ Expression = "Name"; Descending = $false }
)
$uniqueCjk = $sorted.Count

# Clamp the requested size to 1..(number of distinct CJK characters).
if ($TopChars -lt 1) {
    $TopChars = 1
}
if ($TopChars -gt $uniqueCjk -and $uniqueCjk -gt 0) {
    $TopChars = $uniqueCjk
}

$top = @()
if ($uniqueCjk -gt 0) {
    $top = @($sorted | Select-Object -First $TopChars)
}

# Base ASCII set + commonly used Chinese punctuation.
# The ASCII part is generated from the code point range U+0020..U+007E instead
# of a hand-typed literal, so every printable ASCII character is included
# (the previous literal omitted the backtick, U+0060) and no quoting/escaping is needed.
$ascii = -join (0x20..0x7E | ForEach-Object { [char]$_ })

# Chinese/typographic punctuation that is always part of the core charset.
$hanPunctCodePoints = @(
    0x3001, 0x3002, 0xFF0C, 0xFF1A, 0xFF1B, 0xFF01, 0xFF1F,
    0x300A, 0x300B, 0x300C, 0x300D, 0x300E, 0x300F,
    0x201C, 0x201D, 0x2018, 0x2019, 0xFF08, 0xFF09,
    0x3010, 0x3011, 0x2014, 0x2026, 0x00B7, 0x3008, 0x3009
)
$hanPunct = -join ($hanPunctCodePoints | ForEach-Object { [char]$_ })

# $set rejects duplicates; $ordered keeps the order in which characters were first added.
$set = New-Object "System.Collections.Generic.HashSet[string]"
$ordered = New-Object "System.Collections.Generic.List[string]"

Add-CharsToSet -Set $set -Ordered $ordered -Text $ascii
Add-CharsToSet -Set $set -Ordered $ordered -Text $hanPunct

# Append the selected high-frequency CJK characters after ASCII and punctuation.
foreach ($item in $top) {
    $ch = [string]$item.Name
    if ($set.Add($ch)) {
        $ordered.Add($ch)
    }
}

# charset-core.txt: ASCII + punctuation + top CJK characters, as one string.
$charsetCore = ($ordered -join "")
$charsetCorePath = Join-Path $OutputDir "charset-core.txt"
[System.IO.File]::WriteAllText($charsetCorePath, $charsetCore, [System.Text.Encoding]::UTF8)

# charset-all-cjk.txt: every CJK character that was found, in frequency order.
$allCjkChars = ""
if ($uniqueCjk -gt 0) {
    $allCjkChars = (($sorted | ForEach-Object { $_.Name }) -join "")
}
$charsetAllPath = Join-Path $OutputDir "charset-all-cjk.txt"
[System.IO.File]::WriteAllText($charsetAllPath, $allCjkChars, [System.Text.Encoding]::UTF8)

# Total number of CJK character occurrences (sum of all per-character counts).
$totalCjkCount = 0
foreach ($item in $sorted) {
    $totalCjkCount += [int]$item.Value
}

# char-frequency.csv: one row per distinct character with its share and running share.
# Rows are collected in memory and written with a single Out-File call;
# appending row by row reopened the file once per character.
$csvPath = Join-Path $OutputDir "char-frequency.csv"
$csvLines = New-Object "System.Collections.Generic.List[string]"
$csvLines.Add("rank,char,count,ratio,cumulative_ratio")
$acc = 0
$rank = 0
foreach ($item in $sorted) {
    $rank += 1
    $count = [int]$item.Value
    $acc += $count
    $ratio = if ($totalCjkCount -gt 0) { [math]::Round(($count / $totalCjkCount), 8) } else { 0 }
    $cumRatio = if ($totalCjkCount -gt 0) { [math]::Round(($acc / $totalCjkCount), 8) } else { 0 }
    $csvLines.Add("$rank,$($item.Name),$count,$ratio,$cumRatio")
}
$csvLines | Out-File -FilePath $csvPath -Encoding UTF8

# Percentage of all CJK occurrences covered by the first $TopChars characters.
$topCoverage = 0
if ($totalCjkCount -gt 0 -and $TopChars -gt 0) {
    $topCount = ($sorted | Select-Object -First $TopChars | Measure-Object -Property Value -Sum).Sum
    $topCoverage = [math]::Round(($topCount / $totalCjkCount) * 100, 4)
}

# Write a key=value summary of the run next to the generated charsets.
$reportPath = Join-Path $OutputDir "charset-report.txt"
$reportFields = [ordered]@{
    InputFile = $inputFullPath
    DetectedEncoding = $decoded.Encoding
    FileBytes = $bytes.Length
    TotalCjkChars = $totalCjkCount
    UniqueCjkChars = $uniqueCjk
    TopChars = $TopChars
    TopCoveragePercent = $topCoverage
    OutputCharsetCore = $charsetCorePath
    OutputCharsetAll = $charsetAllPath
    OutputFrequencyCsv = $csvPath
}
# String interpolation (not -f) is used on purpose: it formats numbers culture-invariantly.
$report = foreach ($field in $reportFields.GetEnumerator()) {
    "$($field.Key)=$($field.Value)"
}
$report | Out-File -FilePath $reportPath -Encoding UTF8

# Console summary, one line per fact.
@(
    "OK",
    ("Detected encoding: {0}" -f $decoded.Encoding),
    ("Total CJK chars: {0}" -f $totalCjkCount),
    ("Unique CJK chars: {0}" -f $uniqueCjk),
    ("Top {0} coverage: {1}%" -f $TopChars, $topCoverage),
    ("charset-core: {0}" -f $charsetCorePath),
    ("report: {0}" -f $reportPath)
) | Write-Output
# Example invocation of the extraction script above (a separate command, not part of the script).
# The "..." in the paths look like elided placeholders; substitute real paths.
.\extract-common-chars.ps1 `
  -InputFile "...dat" `
  -TopChars 800 `
  -OutputDir "...subset-backup-800"

生成

# Builds woff2 font subsets (core CJK, core misc, fallback) plus a matching
# subset-font.css from a source font and two charset files, using Python fontTools.
param(
    # Path of the source font file (required; must exist).
    [Parameter(Mandatory = $true)]
    [string]$SourceFont,

    # Text file with the core characters (required; must exist).
    [Parameter(Mandatory = $true)]
    [string]$CoreCharsetFile,

    # Text file with all characters; those not in the core file form the fallback subset (required).
    [Parameter(Mandatory = $true)]
    [string]$AllCharsetFile,

    # Output directory. When empty, the directory containing the core charset file is used.
    [string]$OutputDir = "",
    # Command used to start Python; it is invoked with "-m fontTools...." arguments.
    [string]$PythonCommand = "python",
    # When greater than 0, the source font is first instanced at this wght value via
    # fontTools.varLib.instancer; otherwise the source font is subset as-is.
    [int]$InstanceWeight = 400
)

# Strict mode plus "Stop" makes undefined variables/properties and cmdlet errors terminate the script.
Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"

function Resolve-ExistingPath {
    <#
    .SYNOPSIS
    Returns the resolved form of a path, or throws when the path does not exist.

    .PARAMETER Path
    Path to check; treated literally (no wildcard expansion).

    .OUTPUTS
    String: the resolved path.
    #>
    param(
        [Parameter(Mandatory = $true)]
        [string]$Path
    )

    if (Test-Path -LiteralPath $Path) {
        return (Resolve-Path -LiteralPath $Path).Path
    }

    throw "Path not found: $Path"
}

function Is-CjkCodePoint {
    <#
    .SYNOPSIS
    Tests whether a code point lies in one of the CJK ranges
    U+3400-4DBF, U+4E00-9FFF or U+F900-FAFF.

    .PARAMETER CodePoint
    Code point to classify.

    .OUTPUTS
    Boolean.
    #>
    param(
        [int]$CodePoint
    )

    # Inclusive [first, last] pairs.
    $cjkRanges = @(
        @(0x3400, 0x4DBF),
        @(0x4E00, 0x9FFF),
        @(0xF900, 0xFAFF)
    )

    foreach ($range in $cjkRanges) {
        if ($CodePoint -ge $range[0] -and $CodePoint -le $range[1]) {
            return $true
        }
    }

    return $false
}

function Get-UniqueChars {
    <#
    .SYNOPSIS
    Returns the distinct UTF-16 characters of a string in first-occurrence order.

    .PARAMETER Text
    String to scan.

    .OUTPUTS
    String array with one single-character string per distinct character.
    #>
    param(
        [string]$Text
    )

    $seen = New-Object "System.Collections.Generic.HashSet[string]"
    $firstSeen = New-Object "System.Collections.Generic.List[string]"

    for ($index = 0; $index -lt $Text.Length; $index++) {
        $unit = $Text.Substring($index, 1)
        if ($seen.Add($unit)) {
            $firstSeen.Add($unit)
        }
    }

    # The unary comma wraps the array so the pipeline does not unroll it;
    # callers get an array even for zero or one element.
    return ,$firstSeen.ToArray()
}

function Convert-CharsToUnicodeRange {
    <#
    .SYNOPSIS
    Converts single-character strings into a CSS unicode-range value.

    .DESCRIPTION
    Code points are de-duplicated and sorted; consecutive code points are merged
    into "U+start-end" and isolated ones are written as "U+code" (upper-case
    hex), joined with commas.

    .PARAMETER Chars
    Single-character strings. Null or empty entries are skipped.

    .OUTPUTS
    String such as "U+41-43,U+45"; an empty string when there is nothing to convert.
    #>
    param(
        [string[]]$Chars
    )

    if (-not $Chars -or $Chars.Count -eq 0) {
        return ""
    }

    $codeSet = New-Object "System.Collections.Generic.HashSet[int]"
    foreach ($ch in $Chars) {
        if ([string]::IsNullOrEmpty($ch)) {
            continue
        }
        [void]$codeSet.Add([int][char]$ch)
    }

    if ($codeSet.Count -eq 0) {
        return ""
    }

    # @() around the whole pipeline keeps $codes an array even when there is a
    # single code point; a bare pipeline result would be a scalar there, and
    # .Count / indexing on a scalar is not reliable under Set-StrictMode.
    $codes = @($codeSet | Sort-Object)
    $ranges = New-Object "System.Collections.Generic.List[string]"

    $start = [int]$codes[0]
    $prev = $start

    # Runs one step past the end so the final run is flushed by the same code path.
    for ($i = 1; $i -le $codes.Count; $i++) {
        $hasCurrent = $i -lt $codes.Count

        if ($hasCurrent -and ([int]$codes[$i] -eq ($prev + 1))) {
            # Still inside a consecutive run.
            $prev = [int]$codes[$i]
            continue
        }

        if ($start -eq $prev) {
            $ranges.Add(("U+{0:X}" -f $start))
        } else {
            $ranges.Add(("U+{0:X}-{1:X}" -f $start, $prev))
        }

        if ($hasCurrent) {
            $start = [int]$codes[$i]
            $prev = $start
        }
    }

    return ($ranges -join ",")
}

function Invoke-FontSubset {
    <#
    .SYNOPSIS
    Runs "python -m fontTools.subset" for one output file and throws on failure.

    .PARAMETER PythonCommand
    Command used to start Python.

    .PARAMETER SourceFontPath
    Font file to subset.

    .PARAMETER OutputFilePath
    Destination file, passed as --output-file.

    .PARAMETER TextFilePath
    File containing the characters to keep, passed as --text-file.

    .PARAMETER CommonArgs
    Additional fontTools.subset options appended to every invocation; may be omitted.
    #>
    param(
        [string]$PythonCommand,
        [string]$SourceFontPath,
        [string]$OutputFilePath,
        [string]$TextFilePath,
        [string[]]$CommonArgs
    )

    # Named $subsetArgs rather than $args: $args is an automatic variable and
    # should not be assigned to.
    $subsetArgs = @(
        "-m",
        "fontTools.subset",
        $SourceFontPath,
        "--output-file=$OutputFilePath",
        "--text-file=$TextFilePath"
    )

    # Only append when options were supplied; adding $null would create a null element.
    if ($null -ne $CommonArgs -and $CommonArgs.Count -gt 0) {
        $subsetArgs += $CommonArgs
    }

    & $PythonCommand @subsetArgs
    if ($LASTEXITCODE -ne 0) {
        throw "Subset build failed ($OutputFilePath) with exit code $LASTEXITCODE"
    }
}

# Resolve the three required inputs first; a missing path aborts the script here.
$sourceFontPath, $coreCharsetPath, $allCharsetPath = @($SourceFont, $CoreCharsetFile, $AllCharsetFile) |
    ForEach-Object { Resolve-ExistingPath -Path $_ }

# Without an explicit output directory, write next to the core charset file.
if ([string]::IsNullOrWhiteSpace($OutputDir)) {
    $OutputDir = Split-Path -Path $coreCharsetPath -Parent
}

$outputDirFull = [System.IO.Path]::GetFullPath($OutputDir)
[void](New-Item -ItemType Directory -Path $outputDirFull -Force)

# $utf8 is used for reading; $utf8NoBom (no byte order mark) for the files written later.
$utf8 = [System.Text.Encoding]::UTF8
$utf8NoBom = New-Object -TypeName System.Text.UTF8Encoding -ArgumentList $false

$coreText = [System.IO.File]::ReadAllText($coreCharsetPath, $utf8)
$allText = [System.IO.File]::ReadAllText($allCharsetPath, $utf8)

# Defaults for the case where the source font is subset as-is: the CSS then
# advertises the weight range "250 900" and no temporary instance file exists.
$subsetSourceFontPath = $sourceFontPath
$fontWeightDescriptor = "250 900"
$instanceFontPath = ""

if ($InstanceWeight -gt 0) {
    # Pin the wght axis to a single value with fontTools' instancer and subset that file instead.
    $instanceFontPath = Join-Path $outputDirFull ("source-wght-{0}.ttf" -f $InstanceWeight)

    & $PythonCommand -m fontTools.varLib.instancer $sourceFontPath ("wght={0}" -f $InstanceWeight) --output $instanceFontPath

    if ($LASTEXITCODE -ne 0) {
        throw "Font instancer failed with exit code $LASTEXITCODE"
    }

    # A zero exit code alone is not trusted; the output file must really exist.
    $instanceExists = Test-Path -LiteralPath $instanceFontPath
    if (-not $instanceExists) {
        throw "Instanced font not found: $instanceFontPath"
    }

    $subsetSourceFontPath = $instanceFontPath
    $fontWeightDescriptor = [string]$InstanceWeight
}

# Split the distinct core characters into CJK and everything else.
$coreUniqueChars = Get-UniqueChars -Text $coreText
$coreCjkChars = New-Object "System.Collections.Generic.List[string]"
$coreMiscChars = New-Object "System.Collections.Generic.List[string]"
foreach ($ch in $coreUniqueChars) {
    if ([string]::IsNullOrEmpty($ch)) {
        continue
    }

    if (-not (Is-CjkCodePoint -CodePoint ([int][char]$ch))) {
        $coreMiscChars.Add($ch)
        continue
    }

    $coreCjkChars.Add($ch)
}

$coreCjkText = $coreCjkChars.ToArray() -join ""
$coreMiscText = $coreMiscChars.ToArray() -join ""

# These text files are what fontTools.subset later receives via --text-file.
$coreCjkCharsetPath = Join-Path $outputDirFull "charset-core-cjk.txt"
$coreMiscCharsetPath = Join-Path $outputDirFull "charset-core-misc.txt"
[System.IO.File]::WriteAllText($coreCjkCharsetPath, $coreCjkText, $utf8NoBom)
[System.IO.File]::WriteAllText($coreMiscCharsetPath, $coreMiscText, $utf8NoBom)

# Lookup set of every core character, used to filter the "all" charset.
$coreSet = New-Object "System.Collections.Generic.HashSet[string]"
foreach ($coreChar in $coreUniqueChars) {
    [void]$coreSet.Add($coreChar)
}

# Fallback = characters of the "all" file that are not core, in file order.
$fallbackChars = New-Object "System.Collections.Generic.List[string]"
for ($index = 0; $index -lt $allText.Length; $index++) {
    $unit = $allText.Substring($index, 1)
    if ($coreSet.Contains($unit)) {
        continue
    }
    $fallbackChars.Add($unit)
}

$fallbackText = $fallbackChars.ToArray() -join ""
$fallbackCharsetPath = Join-Path $outputDirFull "charset-fallback.txt"
[System.IO.File]::WriteAllText($fallbackCharsetPath, $fallbackText, $utf8NoBom)

# fontTools.subset options shared by all three builds.
$commonSubsetArgs = @(
    "--flavor=woff2",
    "--layout-features=*",
    "--name-IDs=*",
    "--name-legacy",
    "--name-languages=*",
    "--notdef-glyph",
    "--notdef-outline",
    "--recommended-glyphs",
    "--symbol-cmap",
    "--legacy-cmap",
    "--no-hinting"
)

$coreCjkWoff2Path = Join-Path $outputDirFull "core-cjk.woff2"
$coreMiscWoff2Path = Join-Path $outputDirFull "core-misc.woff2"
$fallbackWoff2Path = Join-Path $outputDirFull "fallback.woff2"

# One entry per subset; a subset is built only when its charset is non-empty.
$subsetJobs = @(
    @{ Build = ($coreCjkChars.Count -gt 0); Output = $coreCjkWoff2Path; Charset = $coreCjkCharsetPath },
    @{ Build = ($coreMiscChars.Count -gt 0); Output = $coreMiscWoff2Path; Charset = $coreMiscCharsetPath },
    @{ Build = ($fallbackText.Length -gt 0); Output = $fallbackWoff2Path; Charset = $fallbackCharsetPath }
)

try {
    foreach ($job in $subsetJobs) {
        if (-not $job.Build) {
            continue
        }

        Invoke-FontSubset `
            -PythonCommand $PythonCommand `
            -SourceFontPath $subsetSourceFontPath `
            -OutputFilePath $job.Output `
            -TextFilePath $job.Charset `
            -CommonArgs $commonSubsetArgs
    }
} catch {
    # A failed build aborts the script before its normal cleanup step, so remove
    # the temporary instanced font here (best effort) and rethrow the original error.
    if (-not [string]::IsNullOrWhiteSpace($instanceFontPath) -and (Test-Path -LiteralPath $instanceFontPath)) {
        Remove-Item -LiteralPath $instanceFontPath -Force -ErrorAction SilentlyContinue
    }
    throw
}

# Generate subset-font.css: one @font-face rule per subset that has characters.
$subsetCssPath = Join-Path $outputDirFull "subset-font.css"
$cssLines = New-Object "System.Collections.Generic.List[string]"

# Emits the eight lines of one @font-face rule; reads $fontWeightDescriptor from the script scope.
$fontFaceRule = {
    param([string]$Family, [string]$FileName, [string]$Range)

    "@font-face {"
    "    font-family: `"$Family`";"
    "    src: url(`"./$FileName`") format(`"woff2`");"
    "    font-style: normal;"
    "    font-weight: $fontWeightDescriptor;"
    "    font-display: swap;"
    "    unicode-range: $Range;"
    "}"
}

if ($coreCjkChars.Count -gt 0) {
    $range = Convert-CharsToUnicodeRange -Chars $coreCjkChars.ToArray()
    foreach ($line in (& $fontFaceRule "HJ Source Han Serif Core" "core-cjk.woff2" $range)) {
        $cssLines.Add($line)
    }
    $cssLines.Add("")
}

if ($coreMiscChars.Count -gt 0) {
    $range = Convert-CharsToUnicodeRange -Chars $coreMiscChars.ToArray()
    foreach ($line in (& $fontFaceRule "HJ Source Han Serif Core" "core-misc.woff2" $range)) {
        $cssLines.Add($line)
    }
    $cssLines.Add("")
}

# The fallback rule uses its own family name and is not followed by a blank line.
$fallbackUniqueChars = Get-UniqueChars -Text $fallbackText
if ($fallbackUniqueChars.Count -gt 0) {
    $range = Convert-CharsToUnicodeRange -Chars $fallbackUniqueChars
    foreach ($line in (& $fontFaceRule "HJ Source Han Serif Fallback" "fallback.woff2" $range)) {
        $cssLines.Add($line)
    }
}

$subsetCss = ($cssLines -join [Environment]::NewLine)
[System.IO.File]::WriteAllText($subsetCssPath, $subsetCss, $utf8NoBom)

# Size in bytes of each generated subset; 0 when the file does not exist.
$getFileLength = {
    param([string]$FilePath)
    if (Test-Path -LiteralPath $FilePath) { (Get-Item -LiteralPath $FilePath).Length } else { 0 }
}
$coreCjkBytes = & $getFileLength $coreCjkWoff2Path
$coreMiscBytes = & $getFileLength $coreMiscWoff2Path
$fallbackBytes = & $getFileLength $fallbackWoff2Path

# key=value build report listing inputs, intermediate charsets and output sizes.
$reportPath = Join-Path $outputDirFull "subset-build-report.txt"
$reportLines = @(
    "SourceFont=$sourceFontPath",
    "SubsetSourceFont=$subsetSourceFontPath",
    "InstanceWeight=$InstanceWeight",
    "CoreCharset=$coreCharsetPath",
    "CoreCjkCharset=$coreCjkCharsetPath",
    "CoreMiscCharset=$coreMiscCharsetPath",
    "AllCharset=$allCharsetPath",
    "FallbackCharset=$fallbackCharsetPath",
    "CoreChars=$($coreText.Length)",
    "CoreCjkChars=$($coreCjkText.Length)",
    "CoreMiscChars=$($coreMiscText.Length)",
    "FallbackChars=$($fallbackText.Length)",
    "CoreCjkWoff2=$coreCjkWoff2Path",
    "CoreCjkWoff2Bytes=$coreCjkBytes",
    "CoreMiscWoff2=$coreMiscWoff2Path",
    "CoreMiscWoff2Bytes=$coreMiscBytes",
    "FallbackWoff2=$fallbackWoff2Path",
    "FallbackWoff2Bytes=$fallbackBytes",
    "TotalSubsetBytes=$($coreCjkBytes + $coreMiscBytes + $fallbackBytes)",
    "SubsetCss=$subsetCssPath"
)
[System.IO.File]::WriteAllLines($reportPath, $reportLines, $utf8NoBom)

# Remove the temporary instanced font, if one was created.
if (-not [string]::IsNullOrWhiteSpace($instanceFontPath) -and (Test-Path -LiteralPath $instanceFontPath)) {
    try {
        Remove-Item -LiteralPath $instanceFontPath -Force -ErrorAction Stop
    } catch {
        # Keep the build successful even if temp instance cleanup fails, but tell
        # the user that a leftover file remains instead of hiding the failure.
        Write-Warning ("Could not remove temporary instance font '{0}': {1}" -f $instanceFontPath, $_.Exception.Message)
    }
}

Write-Output "OK"
Write-Output "Core CJK woff2: $coreCjkWoff2Path ($coreCjkBytes bytes)"
Write-Output "Core Misc woff2: $coreMiscWoff2Path ($coreMiscBytes bytes)"
Write-Output "Fallback woff2: $fallbackWoff2Path ($fallbackBytes bytes)"
Write-Output "Subset css: $subsetCssPath"
Write-Output "Report: $reportPath"
# Example invocation of the build script above (a separate command, not part of the script).
# The "..." in the paths look like elided placeholders; substitute real paths.
.\build-font-subsets.ps1 `
  -SourceFont "SourceHanSerifCN.ttf" `
  -CoreCharsetFile "...charset-core.txt" `
  -AllCharsetFile "...charset-all-cjk.txt" `
  -OutputDir "...subset-backup-800" `
  -InstanceWeight 400

结果

subset-backup-800/
 ├── core-cjk.woff2
 ├── core-misc.woff2
 ├── fallback.woff2
 ├── subset-font.css
 ├── charset-core.txt
 └── charset-all-cjk.txt

调用

<link rel="stylesheet" href="<?php $this->options->themeUrl('assets/fonts/SourceHanSerifCN/subset-backup-800/subset-font.css'); ?>">
/* Font stack: the two family names written into the generated subset-font.css
   (core first, then fallback), followed by the generic serif family. */
body {
  font-family: "HJ Source Han Serif Core", "HJ Source Han Serif Fallback", serif;
}

猜你想看