From fd9eaf10c49239d700af848062acc1d5efd54aa8 Mon Sep 17 00:00:00 2001
From: liuzhangjian <liuzhangjian@uniontech.com>
Date: Fri, 4 Dec 2020 15:41:31 +0800
Subject: [PATCH] Title: fix a bug of ChineseTokenizer

Description: When I use ChineseAnalyzer for Chinese word segmentation, I find
that English letters and digits are treated as one word; I think they should be
split into separate tokens.

RootCause: Null

Solution: In ChineseTokenizer::incrementToken(), track whether the previous
character was a letter or a digit and flush the pending token as soon as the
character class changes, so runs of letters and runs of digits are emitted as
separate tokens. ChineseFilter is adjusted so that the resulting numeric tokens
are not discarded.
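
An illustrative example (not part of the original patch) of the intended
ChineseTokenizer behaviour:

    // input text:           "lucene4 test"
    // tokens before patch:  "lucene4", "test"
    // tokens after patch:   "lucene", "4", "test"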
---
 .../common/analysis/cn/ChineseFilter.cpp    |  2 +-
 .../common/analysis/cn/ChineseTokenizer.cpp | 22 ++++++++++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp b/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
index d2a19f3f..83134454 100644
--- a/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
+++ b/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
@@ -38,7 +38,7 @@ bool ChineseFilter::incrementToken() {
                 if (text.length() > 1) {
                     return true;
                 }
-            } else if (UnicodeUtil::isOther(text[0])) {
+            } else if (UnicodeUtil::isOther(text[0]) || UnicodeUtil::isDigit(text[0])) {
                 // One Chinese character as one Chinese word.
                 // Chinese word extraction to be added later here.
                 return true;
diff --git a/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp b/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
index 38bf9875..3b4de742 100644
--- a/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
+++ b/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
@@ -65,6 +65,7 @@ bool ChineseTokenizer::incrementToken() {
 
     length = 0;
     start = offset;
+    bool last_is_en = false, last_is_num = false;
 
     while (true) {
         wchar_t c;
@@ -82,11 +83,30 @@ bool ChineseTokenizer::incrementToken() {
             c = ioBuffer[bufferIndex++];
         }
 
-        if (UnicodeUtil::isDigit(c) || UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
+        if (UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
+            if (last_is_num) {
+                --bufferIndex;
+                --offset;
+                return flush();
+            }
+
+            push(c);
+            if (length == MAX_WORD_LEN) {
+                return flush();
+            }
+            last_is_en = true;
+        } else if (UnicodeUtil::isDigit(c)) {
+            if (last_is_en) {
+                --bufferIndex;
+                --offset;
+                return flush();
+            }
+
             push(c);
             if (length == MAX_WORD_LEN) {
                 return flush();
             }
+            last_is_num = true;
         } else if (UnicodeUtil::isOther(c)) {
             if (length > 0) {
                 --bufferIndex;