Skip to main content

futu_auth/
store.rs

1//! KeyStore: keys.json 加载 + 热替换 + 明文验证
2
3use std::collections::HashMap;
4use std::fs;
5use std::path::{Path, PathBuf};
6use std::sync::Arc;
7
8use arc_swap::ArcSwap;
9use chrono::Utc;
10use serde::{Deserialize, Serialize};
11
12use crate::key::{KeyRecord, hash_plaintext};
13
/// Errors returned while loading, validating, locking, or persisting `keys.json`.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum KeyStoreError {
    /// Reading the keys file at `path` failed.
    #[error("read {path:?}: {source}")]
    Read {
        path: PathBuf,
        source: std::io::Error,
    },
    /// The file at `path` was read but is not valid `KeysFile` JSON.
    #[error("parse {path:?}: {source}")]
    Parse {
        path: PathBuf,
        source: serde_json::Error,
    },
    /// A filesystem mutation on `path` failed. In this file that covers the
    /// temp-file write/rename, creating the parent directory, and opening or
    /// locking the sibling `.lock` file.
    #[error("write {path:?}: {source}")]
    Write {
        path: PathBuf,
        source: std::io::Error,
    },
    /// Serializing a `KeysFile` to JSON failed.
    #[error("serialize: {0}")]
    Serialize(#[from] serde_json::Error),
    /// The file declares a `version` other than the supported one.
    #[error("unsupported keys.json version {0} (supported: 1)")]
    UnsupportedVersion(u32),
    /// Two records share the same `id` (on load), or an appended record's `id`
    /// already exists.
    #[error("duplicate key id {0:?}")]
    DuplicateId(String),
}
39
/// Top-level structure of `keys.json`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KeysFile {
    /// Schema version; loading rejects anything other than `CURRENT_VERSION`.
    pub version: u32,
    /// All key records stored in the file.
    pub keys: Vec<KeyRecord>,
}
46
/// The only `keys.json` schema version this module accepts on load and writes
/// for newly created files.
const CURRENT_VERSION: u32 = 1;
48
/// KeyStore: a hot-swappable set of keys.
///
/// Both snapshots are replaced wholesale on reload / card-number expansion.
/// They are two independent `ArcSwap`s, so a reader may briefly observe a new
/// `current` together with an old `hash_index`.
#[derive(Debug)]
pub struct KeyStore {
    /// Backing file; `None` for a store built with `KeyStore::empty`.
    path: Option<PathBuf>,
    /// Current snapshot of the loaded keys file.
    current: ArcSwap<KeysFile>,
    /// Records of `current` grouped by their `hash` field, used by `verify`.
    hash_index: ArcSwap<HashMap<String, Vec<KeyRecord>>>,
}
56
57impl KeyStore {
58    /// 空 store(没有 keys 文件时)
59    pub fn empty() -> Self {
60        let file = KeysFile {
61            version: CURRENT_VERSION,
62            keys: vec![],
63        };
64        let hash_index = Self::build_hash_index(&file);
65        Self {
66            path: None,
67            current: ArcSwap::from_pointee(file),
68            hash_index: ArcSwap::from_pointee(hash_index),
69        }
70    }
71
72    /// 从文件加载
73    pub fn load(path: impl Into<PathBuf>) -> Result<Self, KeyStoreError> {
74        let path = path.into();
75        let file = Self::load_file(&path)?;
76        let hash_index = Self::build_hash_index(&file);
77        Ok(Self {
78            path: Some(path),
79            current: ArcSwap::from_pointee(file),
80            hash_index: ArcSwap::from_pointee(hash_index),
81        })
82    }
83
84    fn build_hash_index(file: &KeysFile) -> HashMap<String, Vec<KeyRecord>> {
85        let mut index: HashMap<String, Vec<KeyRecord>> = HashMap::with_capacity(file.keys.len());
86        for rec in &file.keys {
87            index.entry(rec.hash.clone()).or_default().push(rec.clone());
88        }
89        index
90    }
91
92    fn load_file(path: &Path) -> Result<KeysFile, KeyStoreError> {
93        if !path.exists() {
94            return Self::load_file_unlocked(path);
95        }
96        let _guard = AdvisoryLockGuard::acquire_shared(path)?;
97        Self::load_file_unlocked(path)
98    }
99
100    fn load_file_unlocked(path: &Path) -> Result<KeysFile, KeyStoreError> {
101        let text = fs::read_to_string(path).map_err(|source| KeyStoreError::Read {
102            path: path.to_path_buf(),
103            source,
104        })?;
105        let mut file: KeysFile =
106            serde_json::from_str(&text).map_err(|source| KeyStoreError::Parse {
107                path: path.to_path_buf(),
108                source,
109            })?;
110        if file.version != CURRENT_VERSION {
111            return Err(KeyStoreError::UnsupportedVersion(file.version));
112        }
113        // 检查重复 id
114        let mut seen = std::collections::HashSet::new();
115        for k in &file.keys {
116            if !seen.insert(k.id.clone()) {
117                return Err(KeyStoreError::DuplicateId(k.id.clone()));
118            }
119        }
120        // v1.4.104 external reviewer S-002 (P0) fix: load 时立即注入 fail-closed sentinel —
121        //
122        // 之前 expand_allowed_card_nums 只在 daemon 启动 + SIGHUP 跑, 但 MCP 等
123        // keystore consumer **不调** expand → 受 `allowed_card_nums` 限制的 key
124        // 加载后 `allowed_acc_ids = None`, 被限额引擎当作 "无限制" silent allow.
125        //
126        // **修法**: load_file 时对每条 key, 若 `allowed_card_nums` 非空但
127        // `allowed_acc_ids` 是 None / empty → 注入 sentinel `Some({0})`. 真实
128        // expansion (e.g. opend daemon GetAccList 之后) 会以 resolved acc_ids 覆盖.
129        // MCP 等不跑 expand 的消费方仍受 sentinel 保护 (fail-closed: real acc_id
130        // ≠ 0 → 永远 reject).
131        //
132        // 这是架构层 fix (与 v1.4.103 codex F1 P1 expand-time sentinel 同语义,
133        // 但提前到 load 时让所有 consumer 受益, 不依赖每个消费方都调 expand).
134        for rec in &mut file.keys {
135            // v1.4.106 F-P2-D: snapshot 文件源原始 allowed_acc_ids (sentinel
136            // 注入和 card_num expansion 之前). expand_allowed_card_nums 用
137            // 此字段作起步, 防止累积 stale resolutions.
138            rec.raw_explicit_acc_ids = rec.allowed_acc_ids.clone();
139
140            let has_card_nums = rec
141                .allowed_card_nums
142                .as_ref()
143                .is_some_and(|v| !v.is_empty());
144            let has_acc_ids = rec.allowed_acc_ids.as_ref().is_some_and(|s| !s.is_empty());
145            if has_card_nums && !has_acc_ids {
146                // 写 sentinel acc_id=0; expand_allowed_card_nums 后续会用
147                // resolved acc_ids 覆盖. 此期间任何 acc_id ≠ 0 的 query 全 reject.
148                let mut sentinel = rec.allowed_acc_ids.clone().unwrap_or_default();
149                sentinel.insert(0);
150                rec.allowed_acc_ids = Some(sentinel);
151                let key_id = crate::metrics::redact_key_id_for_logs(&rec.id);
152                let card_num_count = rec.allowed_card_nums.as_ref().map_or(0, Vec::len);
153                tracing::warn!(
154                    key_id = %key_id,
155                    card_num_count,
156                    "v1.4.104 external report S-002 (P0): keystore load 注入 fail-closed sentinel \
157                     allowed_acc_ids={{0}} (caller 配 allowed_card_nums 但 daemon 还没\
158                     expand). expand_allowed_card_nums 跑完后真实 resolved acc_ids 覆盖. \
159                     MCP / 不跑 expand 的 consumer 仍按 sentinel 保护."
160                );
161            }
162        }
163        Ok(file)
164    }
165
166    /// SIGHUP 热重载:用同一路径重新读文件
167    pub fn reload(&self) -> Result<(), KeyStoreError> {
168        let Some(path) = &self.path else {
169            return Ok(());
170        };
171        let file = Self::load_file(path)?;
172        let hash_index = Self::build_hash_index(&file);
173        self.current.store(Arc::new(file));
174        self.hash_index.store(Arc::new(hash_index));
175        Ok(())
176    }
177
178    /// v1.4.103 (B10): 把每条 key 的 `allowed_card_nums` (string format) 通过
179    /// `resolver` 解析成 acc_id, **合并**进 `allowed_acc_ids` (in-memory only,
180    /// 不写回 keys.json — 文件源不变, 重载后再 expand).
181    ///
182    /// `resolver(card_num) -> Vec<u64>` 由 caller 提供 (典型 closure 持
183    /// `Arc<TrdCache>` 调 `find_acc_ids_by_card_num`).
184    ///
185    /// **行为**:
186    /// - resolver 返 1 个 acc_id → 加入 allowed_acc_ids (resolved)
187    /// - 返 0 个 → 通过 `unresolved_callback` 通知 caller (e.g. log warn)
188    /// - 返 ≥ 2 个 → 通过 `ambiguous_callback` 通知 caller (loud, skip 该条)
189    ///
190    /// 返 `(resolved_count, unresolved_count, ambiguous_count)`.
191    ///
192    /// **典型调用 (daemon 启动 GetAccList 成功后)**:
193    /// ```ignore
194    /// let cache_clone = trd_cache.clone();
195    /// key_store.expand_allowed_card_nums(
196    ///     |cn: &str| cache_clone.find_acc_ids_by_card_num(cn),
197    ///     |key_id, cn| tracing::warn!(key_id, card_num=cn, "card_num not found"),
198    ///     |key_id, cn, candidates| tracing::warn!(key_id, card_num=cn, ?candidates, "ambiguous card_num"),
199    /// );
200    /// ```
201    pub fn expand_allowed_card_nums<R, FU, FA>(
202        &self,
203        resolver: R,
204        mut unresolved_callback: FU,
205        mut ambiguous_callback: FA,
206    ) -> (usize, usize, usize)
207    where
208        R: Fn(&str) -> Vec<u64>,
209        FU: FnMut(&str, &str),         // key_id, card_num
210        FA: FnMut(&str, &str, &[u64]), // key_id, card_num, candidates
211    {
212        let current = self.current.load();
213        let mut new_keys = current.keys.clone();
214        let mut resolved = 0;
215        let mut unresolved = 0;
216        let mut ambiguous = 0;
217        for rec in &mut new_keys {
218            let Some(card_nums) = rec.allowed_card_nums.clone() else {
219                continue;
220            };
221            // v1.4.106 F-P2-D: 从 raw_explicit_acc_ids 起步重新 resolve, 不
222            // 累积 stale resolutions. 之前 `rec.allowed_acc_ids.clone()` 起
223            // 步会让连续 expand 累积 — 若 keys.json 没动但 cache 里某 acc 不
224            // 再可见, 旧 resolved acc_id 仍留在 allowed set 中. 现在每次
225            // expand 都从 file 源原始集合重新计算. raw 为 None → 空集起步.
226            let mut acc_ids = rec.raw_explicit_acc_ids.clone().unwrap_or_default();
227            for cn in &card_nums {
228                let candidates = resolver(cn);
229                match candidates.len() {
230                    0 => {
231                        unresolved += 1;
232                        unresolved_callback(&rec.id, cn);
233                    }
234                    1 => {
235                        acc_ids.insert(candidates[0]);
236                        resolved += 1;
237                    }
238                    _ => {
239                        ambiguous += 1;
240                        ambiguous_callback(&rec.id, cn, &candidates);
241                    }
242                }
243            }
244            // v1.4.103 codex F1 (P1) fail-closed: 无论 acc_ids 是否非空, 都
245            // **必须** 写入 Some(...) — 哪怕是空 HashSet (= "denylist 全部",
246            // 限额引擎 step 0 acc_id 白名单非空 + 不含 ctx.acc_id → reject).
247            //
248            // 旧逻辑 (silent unrestricted): `if !acc_ids.is_empty() { rec.allowed_acc_ids = Some(acc_ids); }`
249            // 当 caller 配置 allowed_card_nums 但**全部 unresolved/ambiguous** 时,
250            // acc_ids 留空, allowed_acc_ids 仍 None → 限额引擎按 "无限制" 处理 →
251            // 受限 key silent unrestricted (反模式 D / pitfall #45).
252            //
253            // 新逻辑: 只要 caller 显式写了 allowed_card_nums (说明 *intent* 是限制),
254            // 就强制 Some(acc_ids) — 即便空集. 限额引擎检测到 Some(empty) 时
255            // 视为 "全 reject" (限额 step 0 `allowed.is_empty()` 已 short-circuit
256            // 不 reject, 但 contains check 永远 false → reject).
257            //
258            // **wait**: 看 limits.rs:332 `check_full_skip_rate`, step 0 是
259            // `if let (Some(allowed), Some(id)) = (&limits.allowed_acc_ids, ctx.acc_id) && !allowed.is_empty() && !allowed.contains(&id)`.
260            // 关键: `!allowed.is_empty()` short-circuit empty set, 等于 "无限制".
261            // 这就是 silent-unrestricted 的根源. 但 `allowed.is_empty()` 短路是
262            // **故意的语义** (允许 None / 空集都视为 "不限制") — 改这个会破坏
263            // 现有 user contract.
264            //
265            // **正确修法**: 既然空集语义 = 无限制不能改, 我们要把 caller intent
266            // (allowed_card_nums 非空) → 限额能识别 "想限但无法 resolve" 的状态.
267            // 选: 把 sentinel acc_id (e.g. 0) 写入 acc_ids 触发 reject — 因为
268            // 没有真账户 acc_id == 0. 限额检查时 acc_ids = {0}, ctx.acc_id =
269            // <real id> ≠ 0 → reject. legitimate id 也 reject — 这是
270            // fail-closed 保守语义.
271            if !card_nums.is_empty() {
272                if acc_ids.is_empty() {
273                    // 全部 unresolved/ambiguous → 写 sentinel 0 让限额 reject 一切
274                    acc_ids.insert(0u64);
275                }
276                rec.allowed_acc_ids = Some(acc_ids);
277            } else if !acc_ids.is_empty() {
278                rec.allowed_acc_ids = Some(acc_ids);
279            }
280        }
281        let file = KeysFile {
282            version: current.version,
283            keys: new_keys,
284        };
285        let hash_index = Self::build_hash_index(&file);
286        self.current.store(Arc::new(file));
287        self.hash_index.store(Arc::new(hash_index));
288        (resolved, unresolved, ambiguous)
289    }
290
291    /// 明文校验:遍历所有未过期 key,匹配则返回 KeyRecord 快照
292    ///
293    /// 如果 key 设置了 `allowed_machines` 且本机不在白名单,会打 warn 日志并视为未匹配。
294    /// 这样做法的代价:攻击者可以通过"能不能过"侧信道区分 key 是否存在 — 我们接受,
295    /// 因为 plaintext 空间是 256 bit 随机 hex,侧信道没意义。
296    pub fn verify(&self, plaintext: &str) -> Option<Arc<KeyRecord>> {
297        if !KeyRecord::is_generated_plaintext_shape(plaintext) {
298            return None;
299        }
300        let computed_hash = hash_plaintext(plaintext);
301        let index = self.hash_index.load();
302        let now = Utc::now();
303        let candidates = index.get(&computed_hash)?;
304        for k in candidates {
305            if k.is_expired(now) {
306                continue;
307            }
308            if let Err(e) = k.check_machine() {
309                let key_id = crate::metrics::redact_key_id_for_logs(&k.id);
310                tracing::warn!(
311                    key_id = %key_id,
312                    error = %e,
313                    "api key matched but machine binding failed; rejecting"
314                );
315                return None;
316            }
317            return Some(Arc::new(k.clone()));
318        }
319        None
320    }
321
322    /// 是否显式加载了 keys 文件。
323    ///
324    /// 注意:空 keys 文件也算 configured,用于让 surface 进入 scope mode
325    /// 并 fail-closed;只有 [`Self::empty`] 才代表 legacy/no-key 模式。
326    #[must_use]
327    pub fn is_configured(&self) -> bool {
328        self.path.is_some()
329    }
330
331    pub fn path(&self) -> Option<&Path> {
332        self.path.as_deref()
333    }
334
335    #[must_use]
336    pub fn len(&self) -> usize {
337        self.current.load().keys.len()
338    }
339
340    #[must_use]
341    pub fn is_empty(&self) -> bool {
342        self.len() == 0
343    }
344
345    /// v1.4.105 external reviewer #4 fix: 当前 KeyStore 是否有任意 key 配置了
346    /// `allowed_card_nums` 限制. 用于 standalone MCP / gRPC / 任何不持
347    /// `TrdCache` 的 keystore consumer 在启动时判断:
348    /// - `false` → 没有 card_num 限制, 跳过 daemon `GetAccList` + expand 全流程
349    ///   (避免无意义的 daemon 请求)
350    /// - `true` → 必须连 daemon, 调 `GetAccList`, 通过
351    ///   [`Self::expand_allowed_card_nums`] 把 card_num resolve 成 acc_id;
352    ///   否则 fail-closed sentinel `{0}` 会让所有真账户 reject (external reviewer BUG
353    ///   v1.4.104-002: standalone MCP 漏调 expand 导致 `0757` 配置的 key
354    ///   全 reject).
355    ///
356    /// 注意: 本方法只检查 raw `allowed_card_nums` 是否非空 — load_file 阶段
357    /// 注入的 sentinel `allowed_acc_ids = {0}` **不**算"已 expand"; 只有
358    /// caller 真跑过 [`Self::expand_allowed_card_nums`] 后才会用 resolved
359    /// acc_ids 覆盖 sentinel.
360    #[must_use]
361    pub fn has_any_card_num_restrictions(&self) -> bool {
362        self.current
363            .load()
364            .keys
365            .iter()
366            .any(|k| k.allowed_card_nums.as_ref().is_some_and(|v| !v.is_empty()))
367    }
368
369    /// 按 id 查询当前快照中的 key(**不做 expiry / machine 校验**,调用方自己做)
370    ///
371    /// 典型用法:MCP 在启动时 `verify(plaintext)` 拿到 id,后续每个请求用
372    /// `get_by_id` 取最新记录,这样 SIGHUP 重载 keys.json 后 scope / 限额 /
373    /// expires_at 的变更能立刻生效(不用重启进程)。
374    ///
375    /// 返回 None 表示 id 在当前文件里不存在(被 remove_key 删掉了),调用方应
376    /// 视为"key 已吊销"直接拒绝。
377    ///
378    /// **注意**:此方法**不做 machine binding 校验**。对于跨 SIGHUP 的 per-msg /
379    /// per-tool 复检场景应改用 [`Self::get_by_id_for_current_machine`],确保
380    /// SIGHUP 后新加的 `allowed_machines` 限制立即生效(避免 startup 验过 →
381    /// SIGHUP 收紧 → 仍按老 record 放行的语义漂移)。
382    pub fn get_by_id(&self, id: &str) -> Option<Arc<KeyRecord>> {
383        let snap = self.current.load_full();
384        snap.keys
385            .iter()
386            .find(|k| k.id == id)
387            .map(|k| Arc::new(k.clone()))
388    }
389
390    /// 按 id 查询当前快照中的 key + 立即校验本机 machine binding。
391    ///
392    /// **统一生命周期入口**: 任何 surface (WS / MCP / REST / gRPC) 在
393    /// 已 verify-once → 跨 SIGHUP 复检场景下应使用此方法替代裸 [`Self::get_by_id`],
394    /// 避免如下漂移:
395    ///
396    /// - startup 时 `verify(plaintext)` 检查 machine binding ✅
397    /// - SIGHUP reload 把该 key 的 `allowed_machines` 收紧(移除本机指纹)
398    /// - 后续 per-msg / per-tool 仅调 `get_by_id` → **绕过 machine binding** →
399    ///   silent unrestricted (反模式 D / pitfall #45 silent-success 同模式)
400    ///
401    /// 行为:
402    /// - id 不存在 → `None`(key 已被 remove_key 吊销,caller 视为吊销拒绝)
403    /// - id 存在 + machine 校验通过 → `Some(rec)`
404    /// - id 存在 + machine 校验失败 → `None` + warn log(与 `verify` 同语义)
405    ///
406    /// **不做 expiry 校验** —— pipeline.rs Step 1.5 / caller 自己做(与
407    /// `get_by_id` 行为对齐,仅差 machine 一层)。
408    pub fn get_by_id_for_current_machine(&self, id: &str) -> Option<Arc<KeyRecord>> {
409        let rec = self.get_by_id(id)?;
410        if let Err(e) = rec.check_machine() {
411            let key_id = crate::metrics::redact_key_id_for_logs(&rec.id);
412            tracing::warn!(
413                key_id = %key_id,
414                error = %e,
415                "api key get_by_id_for_current_machine: machine binding failed; \
416                 treating as revoked (caller should reject as if key not found)"
417            );
418            return None;
419        }
420        Some(rec)
421    }
422
423    /// 导出当前所有 keys 的 id(用于调试 / 审计)
424    #[must_use]
425    pub fn ids(&self) -> Vec<String> {
426        self.current
427            .load()
428            .keys
429            .iter()
430            .map(|k| k.id.clone())
431            .collect()
432    }
433}
434
435/// 追加一条新 key 到 keys.json(atomic rename)
436/// v1.4.106 codex 0558 F5 (P2): RMW (read-modify-write) helper that holds the
437/// flock for the **entire** sequence — load, mutate, write. Without this,
438/// concurrent append_key callers can load → load → write → write and lose
439/// the first writer's record.
440fn with_keys_lock<F, R>(path: &Path, f: F) -> Result<R, KeyStoreError>
441where
442    F: FnOnce(&Path) -> Result<R, KeyStoreError>,
443{
444    if let Some(parent) = path.parent()
445        && !parent.as_os_str().is_empty()
446    {
447        fs::create_dir_all(parent).map_err(|source| KeyStoreError::Write {
448            path: parent.to_path_buf(),
449            source,
450        })?;
451    }
452    let _guard = AdvisoryLockGuard::acquire_exclusive(path)?;
453    f(path)
454}
455
456pub fn append_key(path: &Path, new_record: KeyRecord) -> Result<(), KeyStoreError> {
457    with_keys_lock(path, |path| {
458        let mut file = match fs::metadata(path) {
459            Ok(_) => KeyStore::load_file_unlocked(path)?,
460            Err(_) => KeysFile {
461                version: CURRENT_VERSION,
462                keys: vec![],
463            },
464        };
465        if file.keys.iter().any(|k| k.id == new_record.id) {
466            return Err(KeyStoreError::DuplicateId(new_record.id));
467        }
468        file.keys.push(new_record);
469        write_atomic_inner(path, &file)
470    })
471}
472
473/// 读取 keys.json 并返回所有记录快照(展示用;不暴露 hash 以外的敏感位)
474pub fn list_keys(path: &Path) -> Result<Vec<KeyRecord>, KeyStoreError> {
475    let file = KeyStore::load_file(path)?;
476    Ok(file.keys)
477}
478
479/// 按 id 编辑一条 key(atomic rename);闭包返回 `false` 代表未改动 → 跳过落盘
480///
481/// 适用于就地修改 `allowed_machines` / `expires_at` / `note` 等配置,
482/// 而不想走 "revoke + regen" 流程(否则 plaintext 会换)。
483pub fn update_key<F>(path: &Path, id: &str, mutate: F) -> Result<bool, KeyStoreError>
484where
485    F: FnOnce(&mut KeyRecord) -> bool,
486{
487    // v1.4.106 F5: 整个 RMW 在 flock 内, 防 concurrent update_key 互相覆盖
488    with_keys_lock(path, |path| {
489        let mut file = KeyStore::load_file_unlocked(path)?;
490        let Some(rec) = file.keys.iter_mut().find(|k| k.id == id) else {
491            return Ok(false);
492        };
493        let changed = mutate(rec);
494        if changed {
495            write_atomic_inner(path, &file)?;
496        }
497        Ok(changed)
498    })
499}
500
501/// 按 id 删除一条 key(atomic rename);返回是否真的删掉了一条
502pub fn remove_key(path: &Path, id: &str) -> Result<bool, KeyStoreError> {
503    // v1.4.106 F5: 整个 RMW 在 flock 内
504    with_keys_lock(path, |path| {
505        let mut file = KeyStore::load_file_unlocked(path)?;
506        let before = file.keys.len();
507        file.keys.retain(|k| k.id != id);
508        let removed = before != file.keys.len();
509        if removed {
510            write_atomic_inner(path, &file)?;
511        }
512        Ok(removed)
513    })
514}
515
516/// v1.4.106 codex 0558 F5 (P2): atomic write with advisory flock + unique
517/// tempfile + fsync.
518///
519/// **背景**: 之前的 write_atomic 有 3 个问题:
520///   1. `tmp = path.with_extension("json.tmp")` — 同 path **每个 process 共
521///      享**, 并发写 (daemon + futucli + multiple admin reload) 会互相覆写
522///      tempfile, 最后 rename 时数据交错.
523///   2. **无 advisory flock** — 没有跨 process coordination, race 可让 reader
524///      看到部分写完的 `keys.json` (rename 前如果别的 process 也在 truncate).
525///   3. **无 fsync** — 写完立刻 rename 在 ext4 数据=writeback 模式 / SSD power
526///      loss 下可能丢内容 (file 在 inode 层面存在但 data block 没 flush).
527///
528/// **修法**: 1) tempfile 名带 pid + nanos: `keys.json.<pid>.<nanos>.tmp` — race-free.
529/// 2) 对 `keys.json.lock` (sibling lock file) 取 LOCK_EX flock; 读路径
530/// (load_file) 取 LOCK_SH; RMW 调用方在已经持有 LOCK_EX 时走
531/// `load_file_unlocked` 避免同线程嵌套锁. 3) tempfile open 后 write_all →
532/// sync_all → close → set_permissions → rename → 持有 lock 期间.
533///
534/// v1.4.106 codex 0558 F5: 写盘只做 unique tempfile + fsync + rename. flock 由
535/// caller (with_keys_lock) 在 RMW 范围统一持有, 这里**不再**单独加锁 (避免
536/// 与 with_keys_lock 重入).
537fn write_atomic_inner(path: &Path, file: &KeysFile) -> Result<(), KeyStoreError> {
538    let text = serde_json::to_string_pretty(file)?;
539
540    // v1.4.106 F5: unique tempfile (pid + nanos), 防 concurrent rename 战 tempfile.
541    let nanos = keystore_tempfile_nanos_or_zero();
542    let tmp_name = match path.file_name().and_then(|n| n.to_str()) {
543        Some(name) => format!(
544            "{name}.{pid}.{nanos}.tmp",
545            pid = std::process::id(),
546            nanos = nanos,
547        ),
548        None => format!(
549            "keys.{pid}.{nanos}.tmp",
550            pid = std::process::id(),
551            nanos = nanos,
552        ),
553    };
554    let tmp = path
555        .parent()
556        .map(|p| p.join(&tmp_name))
557        .unwrap_or_else(|| Path::new(&tmp_name).to_path_buf());
558
559    // 写入 tempfile + fsync — 0600 mode 通过 OpenOptions (Unix) 创建时即生效.
560    use std::io::Write;
561    #[cfg(unix)]
562    let mut f = {
563        use std::os::unix::fs::OpenOptionsExt;
564        fs::OpenOptions::new()
565            .create_new(true)
566            .write(true)
567            .mode(0o600)
568            .open(&tmp)
569            .map_err(|source| KeyStoreError::Write {
570                path: tmp.clone(),
571                source,
572            })?
573    };
574    #[cfg(not(unix))]
575    let mut f = fs::OpenOptions::new()
576        .create_new(true)
577        .write(true)
578        .open(&tmp)
579        .map_err(|source| KeyStoreError::Write {
580            path: tmp.clone(),
581            source,
582        })?;
583
584    let write_res = f
585        .write_all(text.as_bytes())
586        .and_then(|_| f.sync_all())
587        .map_err(|source| KeyStoreError::Write {
588            path: tmp.clone(),
589            source,
590        });
591    drop(f);
592
593    if let Err(e) = write_res {
594        if let Err(cleanup_err) = fs::remove_file(&tmp) {
595            tracing::debug!(
596                path = %tmp.display(),
597                error = %cleanup_err,
598                "keystore atomic write failed; tempfile cleanup also failed"
599            );
600        }
601        return Err(e);
602    }
603
604    // 防御 chmod (Unix), OpenOptions.mode 已 0600, 但部分 fs / umask 异常时兜底.
605    #[cfg(unix)]
606    {
607        use std::os::unix::fs::PermissionsExt;
608        if let Err(err) = fs::set_permissions(&tmp, fs::Permissions::from_mode(0o600)) {
609            tracing::warn!(
610                path = %tmp.display(),
611                error = %err,
612                "keystore tempfile chmod 0600 failed"
613            );
614        }
615    }
616
617    // atomic rename → tempfile 替换 inode, reader 看到的永远是完整 file.
618    fs::rename(&tmp, path).map_err(|source| KeyStoreError::Write {
619        path: path.to_path_buf(),
620        source,
621    })?;
622
623    Ok(())
624}
625
626fn keystore_tempfile_nanos_or_zero() -> u128 {
627    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
628        Ok(duration) => duration.as_nanos(),
629        Err(err) => {
630            tracing::warn!(
631                error = ?err,
632                "keystore wall clock is before UNIX_EPOCH; falling back to zero tempfile timestamp"
633            );
634            0
635        }
636    }
637}
638
639// ============================================================================
640// v1.4.106 codex 0558 F5 (P2): advisory file lock helper (Unix flock-based)
641// ============================================================================
642//
643// 用 sibling `<path>.lock` 文件作锁文件 (空文件 just for inode), LOCK_EX 时
644// 阻塞其他 acquire_exclusive caller. drop 时 LOCK_UN 释放.
645//
646// 选择 sibling lock file (而非 lock keys.json 本身):
647//   - 避免 lock + rename 互动 (rename 替换 inode, 老 fd 上的 lock 实际作用于
648//     老 inode, 新 reader 拿不到锁信号)
649//   - 跨 process 一致 — 任何 process open `<path>.lock` 拿同 inode
650
/// Returns the sibling lock-file path for `path`: `<dir>/<file_name>.lock`.
///
/// The suffix is appended to the full file name (`keys.json` becomes
/// `keys.json.lock`) rather than replacing the extension. The name is handled
/// as an `OsString`, so file names that are not valid UTF-8 keep their own
/// distinct lock file instead of collapsing onto the generic `keys.lock`.
///
/// Fallbacks: a path without a file-name component uses `keys.lock` inside its
/// parent; a path without a parent (empty path or a filesystem root) uses the
/// relative path `keys.lock`.
fn lock_path_for(path: &Path) -> PathBuf {
    let Some(dir) = path.parent() else {
        return PathBuf::from("keys.lock");
    };
    let mut name = path
        .file_name()
        .map_or_else(|| std::ffi::OsString::from("keys"), |n| n.to_os_string());
    name.push(".lock");
    dir.join(name)
}
664
/// Guard for an advisory `flock` on the sibling lock file (Unix).
/// The lock is released when the guard is dropped.
#[cfg(unix)]
struct AdvisoryLockGuard {
    /// Open descriptor of the lock file on which the `flock` was taken.
    fd: std::os::fd::OwnedFd,
}
669
670#[cfg(unix)]
671impl AdvisoryLockGuard {
672    fn acquire_exclusive(path: &Path) -> Result<Self, KeyStoreError> {
673        Self::acquire(path, libc::LOCK_EX)
674    }
675
676    fn acquire_shared(path: &Path) -> Result<Self, KeyStoreError> {
677        Self::acquire(path, libc::LOCK_SH)
678    }
679
680    fn acquire(path: &Path, operation: i32) -> Result<Self, KeyStoreError> {
681        use std::os::fd::AsRawFd;
682        use std::os::unix::fs::OpenOptionsExt;
683
684        // sibling lock path: `<dir>/<file_name>.lock` (不复用
685        // path.with_extension — 它替换最后一个扩展名, 我们要追加一个新扩展名).
686        let lock_path = lock_path_for(path);
687
688        let file = fs::OpenOptions::new()
689            .create(true)
690            .read(true)
691            .write(true)
692            .truncate(false)
693            .mode(0o600)
694            .open(&lock_path)
695            .map_err(|source| KeyStoreError::Write {
696                path: lock_path.clone(),
697                source,
698            })?;
699        let raw = file.as_raw_fd();
700        // SAFETY: flock(2) is async-signal-safe; we hold OwnedFd via File
701        // until the guard drops, so fd is valid.
702        let rc = unsafe { libc::flock(raw, operation) };
703        if rc != 0 {
704            return Err(KeyStoreError::Write {
705                path: lock_path,
706                source: std::io::Error::last_os_error(),
707            });
708        }
709        let owned: std::os::fd::OwnedFd = file.into();
710        Ok(Self { fd: owned })
711    }
712}
713
714#[cfg(unix)]
715impl Drop for AdvisoryLockGuard {
716    fn drop(&mut self) {
717        use std::os::fd::AsRawFd;
718        // best-effort unlock on close; closing fd also implicitly releases lock.
719        // SAFETY: fd remains valid while owned by this guard. `flock(LOCK_UN)`
720        // only touches the live fd; if it fails, dropping the fd still releases
721        // the advisory lock as the final fallback.
722        let rc = unsafe { libc::flock(self.fd.as_raw_fd(), libc::LOCK_UN) };
723        if rc != 0 {
724            eprintln!(
725                "futu-auth warning: keystore advisory lock unlock failed: {}",
726                std::io::Error::last_os_error()
727            );
728        }
729    }
730}
731
/// Guard for a `LockFileEx` lock on the sibling lock file (Windows).
/// The lock is released when the guard is dropped.
#[cfg(windows)]
struct AdvisoryLockGuard {
    /// Open handle of the lock file; keeps the handle alive while locked.
    file: fs::File,
    /// Offset structure passed to `LockFileEx`, reused for `UnlockFileEx`.
    overlapped: windows_sys::Win32::System::IO::OVERLAPPED,
    /// Lock-file path, kept only for the unlock-failure message.
    lock_path: PathBuf,
}
738
#[cfg(windows)]
impl AdvisoryLockGuard {
    /// Acquires the lock for `path` with `LOCKFILE_EXCLUSIVE_LOCK`.
    fn acquire_exclusive(path: &Path) -> Result<Self, KeyStoreError> {
        use windows_sys::Win32::Storage::FileSystem::LOCKFILE_EXCLUSIVE_LOCK;

        Self::acquire(path, LOCKFILE_EXCLUSIVE_LOCK)
    }

    /// Acquires the lock for `path` with no flags set (shared lock).
    fn acquire_shared(path: &Path) -> Result<Self, KeyStoreError> {
        Self::acquire(path, 0)
    }

    /// Opens (creating if necessary) the sibling lock file of `path` and locks
    /// its byte range with `LockFileEx(flags)`.
    ///
    /// # Errors
    /// `KeyStoreError::Write` (carrying the lock-file path) if the lock file
    /// cannot be opened or `LockFileEx` reports failure.
    fn acquire(path: &Path, flags: u32) -> Result<Self, KeyStoreError> {
        use std::os::windows::io::AsRawHandle;
        use windows_sys::Win32::Storage::FileSystem::LockFileEx;
        use windows_sys::Win32::System::IO::OVERLAPPED;

        let lock_path = lock_path_for(path);
        let mut open_opts = fs::OpenOptions::new();
        open_opts.create(true).read(true).write(true).truncate(false);
        let file = match open_opts.open(&lock_path) {
            Ok(file) => file,
            Err(source) => {
                return Err(KeyStoreError::Write {
                    path: lock_path,
                    source,
                });
            }
        };

        let mut overlapped = OVERLAPPED::default();
        let handle = file.as_raw_handle() as windows_sys::Win32::Foundation::HANDLE;
        // SAFETY: `handle` is the live handle owned by `file`; `overlapped` is
        // stored in the guard and reused for UnlockFileEx on drop. The locked
        // range starts at offset 0 with the maximum length, i.e. it covers the
        // whole lock file.
        let locked =
            unsafe { LockFileEx(handle, flags, 0, u32::MAX, u32::MAX, &mut overlapped) } != 0;
        if !locked {
            return Err(KeyStoreError::Write {
                path: lock_path,
                source: std::io::Error::last_os_error(),
            });
        }

        Ok(Self {
            file,
            overlapped,
            lock_path,
        })
    }
}
789
#[cfg(windows)]
impl Drop for AdvisoryLockGuard {
    /// Unlocks the byte range locked in `acquire`. A failure is reported on
    /// stderr together with the lock-file path.
    fn drop(&mut self) {
        use std::os::windows::io::AsRawHandle;
        use windows_sys::Win32::Storage::FileSystem::UnlockFileEx;

        let handle = self.file.as_raw_handle() as windows_sys::Win32::Foundation::HANDLE;
        // SAFETY: `handle` stays valid while `file` is held by this guard, and
        // `overlapped` is the same offset structure that was used to lock.
        let unlocked =
            unsafe { UnlockFileEx(handle, 0, u32::MAX, u32::MAX, &mut self.overlapped) } != 0;
        if unlocked {
            return;
        }
        eprintln!(
            "futu-auth warning: keystore advisory lock unlock failed for {}: {}",
            self.lock_path.display(),
            std::io::Error::last_os_error()
        );
    }
}
809
/// No-op lock guard for targets that are neither Unix nor Windows.
#[cfg(all(not(unix), not(windows)))]
struct AdvisoryLockGuard;

#[cfg(all(not(unix), not(windows)))]
impl AdvisoryLockGuard {
    /// Always succeeds without locking anything.
    fn acquire_exclusive(_path: &Path) -> Result<Self, KeyStoreError> {
        // Unknown platform fallback. Unix and Windows have real cross-process
        // advisory locks; other targets keep legacy single-process behavior.
        Ok(Self)
    }

    /// Always succeeds without locking anything.
    fn acquire_shared(_path: &Path) -> Result<Self, KeyStoreError> {
        // Unknown platform fallback. Unix and Windows have real cross-process
        // advisory locks; other targets keep legacy single-process behavior.
        Ok(Self)
    }
}
827
828#[cfg(test)]
829mod tests;