You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

57 lines
1.5 KiB
Lua

-- https://github.com/sundream/crab
local crab = require "crab.c"
local utf8 = require "utf8.c"
-- Load the word list: each line of words.txt is converted with utf8.toutf32
-- into its own table, and the collected tables are handed to crab.open.
local words = {}
for entry in io.lines("words.txt") do
    local runes = {}
    local decoded = utf8.toutf32(entry, runes)
    -- Abort on the first line that cannot be converted, naming the line.
    assert(decoded, "non utf8 words detected:"..entry)
    words[#words + 1] = runes
end
-- Filter object used by the rest of the script.
local filter = crab.open(words)
-- Read the whole sample text, run the filter over all of it, and print the
-- result.
-- io.open plus an explicit close is used rather than io.input so that the
-- process-wide default input stream is not redirected and the file handle is
-- released as soon as the content has been read.
local text_file = assert(io.open("texts.txt", "r"))
local input = text_file:read("*a")
text_file:close()
local texts = {}
-- assert takes (value, message); any further argument is ignored, so only
-- the message is passed.
assert(utf8.toutf32(input, texts), "non utf8 words detected:")
filter:filter(texts)
local output = utf8.toutf8(texts)
print(output)
-- Second pass: change the replacement character to '#' and restrict the
-- filtered range so that the first and last 60 characters are left out.
-- In addition, sensitive words that are part of the exclusion word set are
-- not replaced.
words = {"一定","无论遇到"}
for idx = 1, #words do
    local word = words[idx]
    local runes = {}
    assert(utf8.toutf32(word, runes), "non utf8 words detected:"..word)
    -- Replace the string in place with its converted table form.
    words[idx] = runes
end
-- crab.new is an alias of crab.open
local excluder = crab.new(words)
print("default replace_rune:",filter:replace_rune())
-- 0x23 is the code point of '#'.
filter:replace_rune(0x23)
print("new replace_rune:",filter:replace_rune())
-- Convert the input again so this pass starts from a fresh copy of the text.
texts = {}
-- assert takes (value, message); any further argument is ignored, so only
-- the message is passed.
assert(utf8.toutf32(input, texts), "non utf8 words detected:")

-- Bounds of the region that may be filtered.
-- NOTE(review): if the intent is to skip exactly the first 60 characters and
-- crab positions are 1-based and inclusive, `from` should probably be 61 --
-- confirm against crab's indexing before changing it.
local from = 60
local to = #texts - 60

-- Collect the {first, last} spans inside [from, to] that lie between
-- excluder matches; only those spans are passed to the filter.
local filter_range = {}
local start = from
-- The `start <= to` guard keeps an empty or inverted range (text shorter than
-- the margins, or a match ending exactly at `to`) away from the C module.
while start <= to do
    local found, pos1, pos2 = excluder:next(texts, start, to)
    if not found then
        break
    end
    if pos1 > start then
        filter_range[#filter_range + 1] = {start, pos1 - 1}
    end
    -- Resume the search right after the excluded match.
    start = pos2 + 1
end
-- Remaining tail after the last excluder match, if any.
if start <= to then
    filter_range[#filter_range + 1] = {start, to}
end

for _, span in ipairs(filter_range) do
    filter:filter(texts, span[1], span[2])
end
output = utf8.toutf8(texts)
print(output)