You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
57 lines
1.5 KiB
Lua
57 lines
1.5 KiB
Lua
-- https://github.com/sundream/crab
|
|
|
|
local crab = require "crab.c"
local utf8 = require "utf8.c"

-- Build the word list: every line of words.txt is converted by
-- utf8.toutf32 into its own table, and those tables are collected in `words`.
local words = {}
for line in io.lines("words.txt") do
    local t = {}
    assert(utf8.toutf32(line, t), "non utf8 words detected:"..line)
    words[#words + 1] = t
end

local filter = crab.open(words)

-- Read the whole text through an explicit handle so the process-wide default
-- input is left alone and the file is closed as soon as it has been read.
local text_file = assert(io.open("texts.txt", "r"))
local input = text_file:read("*a")
text_file:close()

local texts = {}
-- assert only uses its second argument as the message, so name the offending
-- file there instead of passing an extra value that is never shown.
assert(utf8.toutf32(input, texts), "non utf8 words detected:texts.txt")

-- The filter's result is read back from the same `texts` table below.
filter:filter(texts)
local output = utf8.toutf8(texts)
print(output)
|
|
|
|
-- Second demo: change the replacement character to '#', restrict filtering to
-- a window that leaves out the first and last 60 characters, and do not
-- replace text that matches a word from the exclusion word set.
words = {"一定","无论遇到"}
for i,word in ipairs(words) do
    local t = {}
    assert(utf8.toutf32(word, t), "non utf8 words detected:"..word)
    -- Replace the string in place with its converted table.
    words[i] = t
end
-- crab.new is alias of crab.open
local excluder = crab.new(words)

print("default replace_rune:",filter:replace_rune())
filter:replace_rune(0x23) -- 0x23 is the code point of '#'
print("new replace_rune:",filter:replace_rune())

-- Convert the input again into a fresh table; the earlier table was already
-- passed through filter:filter.
texts = {}
-- assert only uses its second argument as the message, so name the offending
-- file there instead of passing an extra value that is never shown.
assert(utf8.toutf32(input, texts), "non utf8 words detected:texts.txt")

-- NOTE(review): with 1-based inclusive positions, starting at 60 skips only
-- the first 59 entries; confirm whether 61 was intended.
local from = 60
local to = #texts - 60

-- Collect the {first,last} ranges inside [from,to] that are NOT covered by an
-- excluded word; only those ranges are filtered afterwards.
local filter_range = {}
local start = from
-- Short inputs make the window empty (to < from); skip the search then rather
-- than handing an inverted range to the matcher.
if start <= to then
    -- presumably pos1/pos2 are the first/last index of the match found in
    -- [start,to]; the arithmetic below relies on that -- TODO confirm
    local found,pos1,pos2 = excluder:next(texts,start,to)
    while found do
        if pos1 > start then
            -- Gap between the previous position and this excluded word.
            filter_range[#filter_range + 1] = {start,pos1-1}
        end
        -- Resume right after the excluded word.
        start = pos2 + 1
        found,pos1,pos2 = excluder:next(texts,start,to)
    end
    -- Remainder after the last excluded word; empty when a match ended
    -- exactly at `to`.
    if start <= to then
        filter_range[#filter_range + 1] = {start,to}
    end
end

for _,range in ipairs(filter_range) do
    filter:filter(texts,range[1],range[2])
end
output = utf8.toutf8(texts)
print(output)
|