package main

import (
	"errors"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"unicode"

	"github.com/yasushi-saito/rbtree"
	"golang.org/x/text/transform"
)

// InputIndex is the list of index entries read from the input files, in the
// order defined by CompareIndexEntry.
type InputIndex []IndexEntry

// NewInputIndex reads every input selected by option (standard input when
// option.stdin is set, otherwise each file named in option.input), merges the
// entries into an ordered set and returns them as a slice.
//
// Any failure to open an input file terminates the program via log.Fatalln.
func NewInputIndex(option *InputOptions, style *InputStyle) *InputIndex {
	inset := rbtree.NewTree(CompareIndexEntry)

	if option.stdin {
		readIdxFile(inset, os.Stdin, option, style)
	} else {
		for _, idxname := range option.input {
			// If the file does not exist and the name has no extension,
			// retry with the default ".idx" extension appended.
			if _, err := os.Stat(idxname); os.IsNotExist(err) && filepath.Ext(idxname) == "" {
				idxname += ".idx"
			}
			idxfile, err := os.Open(idxname)
			if err != nil {
				log.Fatalln(err.Error())
			}
			readIdxFile(inset, idxfile, option, style)
			idxfile.Close()
		}
	}

	// Flatten the ordered set into a slice of entry values.
	var in InputIndex
	for iter := inset.Min(); !iter.Limit(); iter = iter.Next() {
		pentry := iter.Item().(*IndexEntry)
		in = append(in, *pentry)
	}
	return &in
}

// readIdxFile scans all index entries from idxfile (decoded through
// option.decoder) and merges them into inset.
//
// Entries with a syntax error are logged, counted as rejected, and the rest of
// the offending line is skipped. Any other read error is fatal. A summary of
// accepted and rejected entries is logged at the end.
func readIdxFile(inset *rbtree.Tree, idxfile *os.File, option *InputOptions, style *InputStyle) {
	log.Printf("读取输入文件 %s ……\n", idxfile.Name())

	accepted, rejected := 0, 0
	idxreader := NewNumberdReader(transform.NewReader(idxfile, option.decoder))

scan:
	for {
		entry, err := ScanIndexEntry(idxreader, option, style)
		switch {
		case errors.Is(err, io.EOF):
			break scan

		case errors.Is(err, ScanSyntaxError):
			rejected++
			log.Printf("%s:%d: %s\n", idxfile.Name(), idxreader.Line(), err.Error())
			// Skip the remainder of the current line.
			if err := idxreader.SkipLine(); errors.Is(err, io.EOF) {
				break scan
			} else if err != nil {
				log.Fatalln(err.Error())
			}

		case err != nil:
			log.Fatalln(err.Error())

		default:
			accepted++
			if old := inset.Get(entry); old != nil {
				// The entry is already known: just merge the page lists.
				oldentry := old.(*IndexEntry)
				oldentry.pagelist = append(oldentry.pagelist, entry.pagelist...)
			} else {
				// The entry is not in the set yet: insert the entry itself
				// and every missing ancestor. Ancestors carry no pages.
				for len(entry.level) > 0 {
					inset.Insert(entry)
					parent := &IndexEntry{
						level:    entry.level[:len(entry.level)-1],
						pagelist: nil,
					}
					if inset.Get(parent) != nil {
						break
					}
					entry = parent
				}
			}
		}
	}

	log.Printf("接受 %d 项,拒绝 %d 项。\n", accepted, rejected)
}

// skipspaces consumes whitespace and line comments from reader and stops in
// front of the first rune that is neither. A comment starts with
// style.comment and runs to the end of the line.
//
// It returns the first error reported by the reader, including io.EOF.
func skipspaces(reader *NumberdReader, style *InputStyle) error {
	for {
		r, _, err := reader.ReadRune()
		if err != nil {
			return err
		}
		if r == style.comment {
			// Propagate a failing SkipLine instead of silently ignoring it.
			if err := reader.SkipLine(); err != nil {
				return err
			}
			continue
		}
		if !unicode.IsSpace(r) {
			// Put the significant rune back for the caller.
			reader.UnreadRune()
			return nil
		}
	}
}

// ScanIndexEntry reads one index entry from reader. The expected shape is
//
//	<keyword> <arg_open> key/value/encap part <arg_close> <arg_open> page <arg_close>
//
// where the delimiters are taken from style. Whitespace and comments are
// skipped before the keyword, after the keyword and between the two arguments.
//
// It returns ScanSyntaxError when the input does not match that shape, and
// passes through any error from the reader (including io.EOF) or from
// scanPage. On error the returned entry is nil.
func ScanIndexEntry(reader *NumberdReader, option *InputOptions, style *InputStyle) (*IndexEntry, error) {
	var entry IndexEntry
	page := new(Page)

	// Skip leading whitespace.
	if err := skipspaces(reader, style); err != nil {
		return nil, err
	}
	// Consume the keyword; every rune must match exactly.
	for _, want := range style.keyword {
		got, _, err := reader.ReadRune()
		if err != nil {
			return nil, err
		}
		if got != want {
			return nil, ScanSyntaxError
		}
	}
	// Skip whitespace between the keyword and the first argument.
	if err := skipspaces(reader, style); err != nil {
		return nil, err
	}

	// States of the scanning automaton.
	const (
		SCAN_OPEN = iota
		SCAN_KEY
		SCAN_VALUE
		SCAN_COMMAND
		SCAN_PAGE
		SCAN_PAGERANGE
	)

	// First argument: scan from arg_open to the matching arg_close and build
	// the entry levels and the page encapsulation.
	state := SCAN_OPEN
	quoted := false  // the previous rune was an unescaped quote
	escaped := false // the previous rune was the escape rune
	argDepth := 0    // nesting depth of arg_open/arg_close inside the argument
	var token []rune
	var entryInput []rune
	page.rangetype = PAGE_NORMAL

	// pushKeyVal finishes the current token as a new level whose key and text
	// are both the token, then switches to state next.
	pushKeyVal := func(next int) {
		str := string(token)
		if option.compress {
			str = strings.TrimSpace(str)
		}
		entry.level = append(entry.level, IndexEntryLevel{key: str, text: str})
		token = nil
		state = next
	}
	// setValue finishes the current token as the display text of the most
	// recently pushed level, then switches to state next.
	setValue := func(next int) {
		entry.level[len(entry.level)-1].text = string(token)
		token = nil
		state = next
	}

scanKV:
	for {
		r, _, err := reader.ReadRune()
		if err != nil {
			return nil, err
		}
		// Record the raw input only for runes that were actually read.
		entryInput = append(entryInput, r)

		switch state {
		case SCAN_OPEN:
			if r != style.arg_open {
				return nil, ScanSyntaxError
			}
			state = SCAN_KEY

		case SCAN_KEY:
			switch {
			case quoted:
				// A quoted rune is taken literally and deliberately leaves
				// the escape flag untouched.
				token = append(token, r)
				quoted = false
				continue scanKV
			case r == style.arg_open && !escaped:
				token = append(token, r)
				argDepth++
			case r == style.arg_close && !escaped:
				if argDepth == 0 {
					pushKeyVal(SCAN_OPEN)
					break scanKV
				}
				token = append(token, r)
				argDepth--
			case r == style.actual:
				pushKeyVal(SCAN_VALUE)
			case r == style.encap:
				pushKeyVal(SCAN_PAGERANGE)
			case r == style.level:
				pushKeyVal(SCAN_KEY)
			case r == style.quote && !escaped:
				quoted = true
			default:
				token = append(token, r)
			}
			// NOTE(review): an escape rune that itself follows an escape rune
			// still sets the flag; confirm this is the intended semantics.
			escaped = r == style.escape

		case SCAN_VALUE:
			// The actual symbol gets no special treatment here for now.
			switch {
			case quoted:
				token = append(token, r)
				quoted = false
				continue scanKV
			case r == style.arg_open && !escaped:
				token = append(token, r)
				argDepth++
			case r == style.arg_close && !escaped:
				if argDepth == 0 {
					setValue(SCAN_OPEN)
					break scanKV
				}
				token = append(token, r)
				argDepth--
			case r == style.encap:
				setValue(SCAN_PAGERANGE)
			case r == style.level:
				setValue(SCAN_KEY)
			case r == style.quote && !escaped:
				quoted = true
			default:
				token = append(token, r)
			}
			escaped = r == style.escape

		case SCAN_PAGERANGE:
			// Exactly one rune is examined in this state (the one right
			// after encap); every path that falls through moves on to
			// SCAN_COMMAND.
			switch {
			case quoted:
				token = append(token, r)
				quoted = false
				continue scanKV
			case r == style.arg_open || r == style.arg_close ||
				r == style.actual || r == style.encap || r == style.level:
				// encap must not be followed directly by arg_open,
				// arg_close and the like.
				return nil, ScanSyntaxError
			case r == style.range_open:
				page.rangetype = PAGE_OPEN
			case r == style.range_close:
				page.rangetype = PAGE_CLOSE
			case r == style.quote:
				// The previous rune was encap, so this quote cannot have
				// been escaped.
				quoted = true
			default:
				token = append(token, r)
			}
			state = SCAN_COMMAND
			escaped = r == style.escape

		case SCAN_COMMAND:
			// encap, actual and level get no special treatment here.
			switch {
			case quoted:
				token = append(token, r)
				quoted = false
				continue scanKV
			case r == style.arg_open && !escaped:
				token = append(token, r)
				argDepth++
			case r == style.arg_close && !escaped:
				if argDepth == 0 {
					page.encap = string(token)
					break scanKV
				}
				token = append(token, r)
				argDepth--
			case r == style.quote && !escaped:
				quoted = true
			default:
				token = append(token, r)
			}
			escaped = r == style.escape

		default:
			panic("扫描状态错误")
		}
	}
	entry.input = string(entryInput)

	// Skip whitespace between the two arguments.
	if err := skipspaces(reader, style); err != nil {
		return nil, err
	}

	// Second argument: scan from arg_open to arg_close and parse the page.
	state = SCAN_OPEN
	token = nil
pageLoop:
	for {
		r, _, err := reader.ReadRune()
		if err != nil {
			return nil, err
		}
		switch state {
		case SCAN_OPEN:
			if r != style.arg_open {
				return nil, ScanSyntaxError
			}
			state = SCAN_PAGE

		case SCAN_PAGE:
			switch r {
			case style.arg_close:
				page.numbers, err = scanPage(token, style.page_compositor)
				if err != nil {
					return nil, err
				}
				break pageLoop
			case style.arg_open:
				// Nested arguments are not allowed inside the page argument.
				return nil, ScanSyntaxError
			default:
				token = append(token, r)
			}

		default:
			panic("扫描状态错误")
		}
		// Original author's note: handling of style.page_compositor is not
		// implemented in this loop.
	}
	page.compositor = style.page_compositor
	entry.pagelist = append(entry.pagelist, page)

	return &entry, nil
}

// ScanSyntaxError is returned by ScanIndexEntry when an index entry is
// syntactically malformed.
var ScanSyntaxError = errors.New("索引项语法错误")

// IndexEntry is a single index entry: the raw text of its first argument, its
// hierarchy of levels, and the pages it refers to.
type IndexEntry struct {
	input    string
	level    []IndexEntryLevel
	pagelist []*Page
}

// CompareIndexEntry implements rbtree.CompareFunc for *IndexEntry items.
//
// Entries are compared level by level, first by key and then by text. When
// one entry's levels are a prefix of the other's, the shorter entry sorts
// first. It returns -1, 0 or 1.
func CompareIndexEntry(a, b rbtree.Item) int {
	x := a.(*IndexEntry)
	y := b.(*IndexEntry)
	for i := range x.level {
		if i >= len(y.level) {
			return 1
		}
		if c := strings.Compare(x.level[i].key, y.level[i].key); c != 0 {
			return c
		}
		if c := strings.Compare(x.level[i].text, y.level[i].text); c != 0 {
			return c
		}
	}
	if len(x.level) < len(y.level) {
		return -1
	}
	return 0
}

// IndexEntryLevel is one level of an index entry.
type IndexEntryLevel struct {
	key  string // token before the actual symbol; compared first when ordering
	text string // token after the actual symbol, or the key when there is none
}

// RangeType tells whether a page opens a range, closes a range, or is a plain
// page reference.
type RangeType int

const (
	PAGE_UNKNOWN RangeType = iota
	PAGE_OPEN
	PAGE_NORMAL
	PAGE_CLOSE
)

// String returns a one-character representation of rt: "?" for PAGE_UNKNOWN,
// "(" for PAGE_OPEN, "." for PAGE_NORMAL and ")" for PAGE_CLOSE. It panics on
// any other value.
func (rt RangeType) String() string {
	switch rt {
	case PAGE_UNKNOWN:
		return "?"
	case PAGE_OPEN:
		return "("
	case PAGE_NORMAL:
		return "."
	case PAGE_CLOSE:
		return ")"
	default:
		panic("区间格式错误")
	}
}