近期在做一个敏感词服务。由于敏感词词库量比较大,服务在重载词库的过程中需要对词做分词和词频解析,耗时比较久,导致重载期间的敏感词检测请求会出现 bug。修复的要求是:必须保证新词库已经完整加载之后,才能正式替换正在提供服务的词库。

原代码

package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"git.xxx.com/pepper/sego"
	"github.com/fvbock/endless"
	"github.com/robfig/cron"
	"io"
	"log"
	"net/http"
	"os"
	"runtime"
	"strings"
	"time"
)

// Command-line flags and process-wide settings of the segmentation server.
var (
	// host is the host name used to build the HTTP listen address.
	host           = flag.String("host", "", "HTTP服务器主机名")
	// port is the TCP port used to build the HTTP listen address.
	port           = flag.Int("port", 80, "HTTP服务器端口")
	// dict is the path of the dictionary file passed to LoadDictionary.
	dict           = flag.String("dict", "/home/q/system/xxx/xxx_sego/test/dictionary.txt", "词典文件")
	// reloadInterval is appended to "@every " to form the cron schedule on
	// which reloadDict runs.
	reloadInterval = flag.String("reloadInterval", "1m", "自动重启周期间隔")
	//staticFolder = flag.String("static_folder", "/home/q/system/xxx/go/src/github.com/pakrchen/sego/server/static", "静态页面存放的目录")
	// segmenter is the single shared segmenter: it is loaded by main and
	// reloadDict and used by JsonRpcServer to segment request text.
	segmenter = sego.Segmenter{}

	// logFile is the file every logToFile call in this program appends to.
	logFile = "/data/nginx/logs/api.xxx.corp.qihoo.net/app/segoserver.log"
)

// init parses the command-line flags and starts a cron scheduler that calls
// reloadDict on the "@every <reloadInterval>" schedule.
func init() {
	flag.Parse()

	// Schedule the periodic dictionary reload.
	c := cron.New()
	if err := c.AddFunc("@every "+*reloadInterval, reloadDict); err != nil {
		// A schedule that cannot be registered means the dictionary is never
		// reloaded; report it instead of discarding the error.
		log.Printf("schedule dictionary reload with interval %q: %v", *reloadInterval, err)
	}
	c.Start()
}

func reloadDict() {
	logToFile(logFile, "reload "+ fmt.Sprintf("%d", *port) +" start interval : "+*reloadInterval+" "+time.Now().Format("2006/01/02 15:04:05"))
	segmenter.LoadDictionary(*dict)
	logToFile(logFile, "reload "+ fmt.Sprintf("%d", *port) +" end :"+*reloadInterval+" "+time.Now().Format("2006/01/02 15:04:05"))
}

func logToFile(filename string, v ...interface{}) {
	f, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		fmt.Println(err)
	}
	// defer f.Close()

	log.SetOutput(f)
	log.Println(v)
	_ = f.Close()
}

// HitFilter groups the "Text" of every entry in words under each of the
// "|"-separated tags found in that entry's "Pos" value.
//
// Entries are visited from the last one to the first, so the texts collected
// for a tag appear in reverse input order. An entry whose "Pos" is empty or
// missing is recorded under the empty-string key, because splitting "" yields
// a single empty tag. The returned bool is true when at least one key was
// recorded. The text argument is accepted but not used.
func HitFilter(text string, words []map[string]string) (map[string][]string, bool) {
	grouped := map[string][]string{}

	for idx := len(words) - 1; idx >= 0; idx-- {
		entry := words[idx]
		for _, tag := range strings.Split(entry["Pos"], "|") {
			grouped[tag] = append(grouped[tag], entry["Text"])
		}
	}

	return grouped, len(grouped) > 0
}

// JsonRpcServer is the HTTP handler for the segmentation endpoint.
//
// It takes the text from the "text" query parameter, falling back to the
// "text" POST form value when the query parameter is empty, segments it with
// the package-level segmenter (in search mode when the "mode" query parameter
// is "search"), runs HitFilter over the result, logs the request and response
// to logFile, and writes a JSON object with the keys "Segment", "Hit" and
// "HitMap".
func JsonRpcServer(w http.ResponseWriter, req *http.Request) {
	// Text to segment: query string first, then the form body.
	text := req.URL.Query().Get("text")
	if text == "" {
		text = req.PostFormValue("text")
	}

	// Search mode is enabled only by mode=search.
	searchMode := req.URL.Query().Get("mode") == "search"

	// Segment the text and collect the hits.
	segments := segmenter.Segment([]byte(text))
	words := sego.SegmentsToSliceWithPos(segments, searchMode)
	hitMap, hit := HitFilter(text, words)

	payload := map[string]interface{}{"Segment": words, "Hit": hit, "HitMap": hitMap}
	response, _ := json.Marshal(payload)
	body := string(response)
	logToFile(logFile, "ip: ", req.RemoteAddr, "\t", text, "\t", body)

	w.Header().Set("Content-Type", "application/json")
	_, _ = io.WriteString(w, body)
}

// main loads the dictionary into the package-level segmenter, registers
// JsonRpcServer on "/json" and serves HTTP on host:port through endless until
// the server stops.
func main() {
	// Set GOMAXPROCS to the number of CPUs.
	runtime.GOMAXPROCS(runtime.NumCPU())

	// Initialise the segmenter.
	segmenter.LoadDictionary(*dict)

	http.HandleFunc("/json", JsonRpcServer)
	//http.Handle("/", http.FileServer(http.Dir(*staticFolder)))
	log.Print("服务器启动")

	addr := fmt.Sprintf("%s:%d", *host, *port)
	if err := endless.ListenAndServe(addr, nil); err != nil {
		// Report why serving stopped (for example a failed bind) instead of
		// exiting silently.
		log.Printf("listen and serve on %s: %v", addr, err)
	}
}


修正后的代码采用指针切换的形式,保证新词库完整加载之后再切换载入:

package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"git.xxx.com/pepper/sego"
	"github.com/fvbock/endless"
	"github.com/robfig/cron"
	"io"
	"log"
	"net/http"
	"os"
	"runtime"
	"strings"
	"sync/atomic"
	"time"
	"unsafe"
)

// Command-line flags and process-wide settings of the segmentation server.
var (
	// host is the host name used to build the HTTP listen address.
	host           = flag.String("host", "", "HTTP服务器主机名")
	// port is the TCP port used to build the HTTP listen address.
	port           = flag.Int("port", 80, "HTTP服务器端口")
	// dict is the path of the dictionary file passed to LoadDictionary.
	dict           = flag.String("dict", "/home/q/system/xxx/xxx_sego/test/dictionary.txt", "词典文件")
	// reloadInterval is appended to "@every " to form the cron schedule on
	// which reloadDict runs.
	reloadInterval = flag.String("reloadInterval", "1m", "自动重启周期间隔")

	// logFile is the file every logToFile call in this program appends to.
	logFile = "/data/nginx/logs/api.data.xxx.com/app/segoserver.log"
)

// segmenter is the swappable pointer to the segmenter currently in use. It
// starts out nil, is read with getSego and replaced with storeSego, both of
// which go through sync/atomic.
var segmenter = unsafe.Pointer(nil)

// getSego atomically loads the pointer held in segmenter and returns it as a
// *sego.Segmenter. The result is nil until storeSego has been called.
func getSego() *sego.Segmenter {
	return (*sego.Segmenter)(atomic.LoadPointer(&segmenter))
}

// storeSego atomically replaces the pointer held in segmenter with m, so
// later getSego calls return m.
func storeSego(m *sego.Segmenter) {
	next := unsafe.Pointer(m)
	atomic.StorePointer(&segmenter, next)
}

// init parses the command-line flags and starts a cron scheduler that calls
// reloadDict on the "@every <reloadInterval>" schedule.
func init() {
	flag.Parse()

	// Schedule the periodic dictionary reload.
	c := cron.New()
	if err := c.AddFunc("@every "+*reloadInterval, reloadDict); err != nil {
		// A schedule that cannot be registered means the dictionary is never
		// reloaded; report it instead of discarding the error.
		log.Printf("schedule dictionary reload with interval %q: %v", *reloadInterval, err)
	}
	c.Start()
}

func reloadDict() {
	logToFile(logFile, "reload "+ fmt.Sprintf("%d", *port) +" start interval : "+*reloadInterval+" "+time.Now().Format("2006/01/02 15:04:05"))
	segmenter:=sego.Segmenter{}
    segmenter.LoadDictionary(*dict)
    //新词库加载完波后更新指针值
	storeSego(&segmenter)
	logToFile(logFile, "reload "+ fmt.Sprintf("%d", *port) +" end :"+*reloadInterval+" "+time.Now().Format("2006/01/02 15:04:05"))
}

func logToFile(filename string, v ...interface{}) {
	f, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		fmt.Println(err)
	}
	// defer f.Close()

	log.SetOutput(f)
	log.Println(v)
	_ = f.Close()
}

// HitFilter groups the "Text" of every entry in words under each of the
// "|"-separated tags found in that entry's "Pos" value.
//
// Entries are visited from the last one to the first, so the texts collected
// for a tag appear in reverse input order. An entry whose "Pos" is empty or
// missing is recorded under the empty-string key, because splitting "" yields
// a single empty tag. The returned bool is true when at least one key was
// recorded. The text argument is accepted but not used.
func HitFilter(text string, words []map[string]string) (map[string][]string, bool) {
	grouped := map[string][]string{}

	for idx := len(words) - 1; idx >= 0; idx-- {
		entry := words[idx]
		for _, tag := range strings.Split(entry["Pos"], "|") {
			grouped[tag] = append(grouped[tag], entry["Text"])
		}
	}

	return grouped, len(grouped) > 0
}

// JsonRpcServer is the HTTP handler for the segmentation endpoint.
//
// It takes the text from the "text" query parameter, falling back to the
// "text" POST form value when the query parameter is empty, segments it with
// the segmenter returned by getSego (in search mode when the "mode" query
// parameter is "search"), runs HitFilter over the result, logs the request
// and response to logFile, and writes a JSON object with the keys "Segment",
// "Hit" and "HitMap".
func JsonRpcServer(w http.ResponseWriter, req *http.Request) {
	// Text to segment: query string first, then the form body.
	text := req.URL.Query().Get("text")
	if text == "" {
		text = req.PostFormValue("text")
	}

	// Search mode is enabled only by mode=search.
	searchMode := req.URL.Query().Get("mode") == "search"

	// Take the currently published segmenter once and use it for the whole
	// request.
	current := getSego()
	segments := current.Segment([]byte(text))
	words := sego.SegmentsToSliceWithPos(segments, searchMode)
	hitMap, hit := HitFilter(text, words)

	payload := map[string]interface{}{"Segment": words, "Hit": hit, "HitMap": hitMap}
	response, _ := json.Marshal(payload)
	body := string(response)
	logToFile(logFile, "ip: ", req.RemoteAddr, "\t", text, "\t", body)

	w.Header().Set("Content-Type", "application/json")
	_, _ = io.WriteString(w, body)
}

// main performs the first dictionary load through reloadDict, registers
// JsonRpcServer on "/json" and serves HTTP on host:port through endless until
// the server stops.
func main() {
	// Set GOMAXPROCS to the number of CPUs.
	runtime.GOMAXPROCS(runtime.NumCPU())

	// Initialise the segmenter before the handler is registered, so that
	// getSego already returns a loaded segmenter when requests arrive.
	reloadDict()

	http.HandleFunc("/json", JsonRpcServer)
	//http.Handle("/", http.FileServer(http.Dir(*staticFolder)))
	log.Print("服务器启动")

	addr := fmt.Sprintf("%s:%d", *host, *port)
	if err := endless.ListenAndServe(addr, nil); err != nil {
		// Report why serving stopped (for example a failed bind) instead of
		// exiting silently.
		log.Printf("listen and serve on %s: %v", addr, err)
	}
}