Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

simdutf: simdutf_connector: in_tail: Implement UTF-16LE/UTF-16BE encoder #9468

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
676f7f2
build: lib: Bundle simdutf amalgamation v5.5.0
cosmo0920 Sep 27, 2024
6f6ec53
simdutf_connector: Add C connector for simdutf library
cosmo0920 Sep 27, 2024
8e18853
in_tail: Add Unicode encoder support
cosmo0920 Oct 3, 2024
1a58bc8
in_tail: tests: Add UTF-16LE and UTF-16BE with BOM test cases for uni…
cosmo0920 Oct 4, 2024
f73690a
simdutf_connector: Make buildable on macOS
cosmo0920 Oct 4, 2024
99f62d7
build: Do not install simdutf related header when turning off
cosmo0920 Oct 4, 2024
a5e397b
in_tail: tests: Add testcases for subdivision flags characters
cosmo0920 Oct 7, 2024
1ffecf6
in_tail: Treat as an error when unsupported encoding is specified
cosmo0920 Oct 7, 2024
e34b3c7
workflows: Use g++ or clang++ for C++ source buildings and linkings
cosmo0920 Oct 7, 2024
b785ab7
workflows: Use g++ for C++ sources on system libs task
cosmo0920 Oct 7, 2024
9e1de85
build: Add CXX flags for gcov
cosmo0920 Oct 7, 2024
e35657c
in_tail: Fix indentation style
cosmo0920 Oct 8, 2024
cd810d2
simdutf_connector: Suppress compiler warnings
cosmo0920 Oct 8, 2024
4291674
in_tail: Accept more variants for specifying UTF-16s
cosmo0920 Oct 9, 2024
8a8fb04
simdutf_connector: Handle newlines
cosmo0920 Oct 9, 2024
330e4ef
in_tail: Convert encodings before splitting lines
cosmo0920 Oct 10, 2024
21ba01d
in_tail: Align 2-bytes alignments if UTF-16 encodings are enabled
cosmo0920 Oct 10, 2024
19a0a1a
dockerfiles: centos-7: Disable SIMDUTF module due to gcc-4 series doe…
cosmo0920 Oct 28, 2024
36537a4
packaging: distros: centos: Disble simdutf stuffs on CentOS 7
cosmo0920 Oct 28, 2024
34c3cf6
packaging: Detect centos/6 or centos/7 to turn off simdutf stuffs
cosmo0920 Oct 28, 2024
189756d
packaging: Turn off simdutf on centos/7 ARM64bit
cosmo0920 Oct 28, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions .github/workflows/pr-compile-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,12 @@ jobs:
flb_option:
- "-DFLB_PREFER_SYSTEM_LIBS=On"
compiler:
- gcc
- clang
- gcc:
cc: gcc
cxx: g++
- clang:
cc: clang
cxx: clang++
steps:
- name: Setup environment
run: |
Expand All @@ -53,7 +57,7 @@ jobs:
- name: Checkout Fluent Bit code
uses: actions/checkout@v4

- name: ${{ matrix.compiler }} - ${{ matrix.flb_option }}
- name: ${{ matrix.compiler.cc }} & ${{ matrix.compiler.cxx }} - ${{ matrix.flb_option }}
run: |
export nparallel=$(( $(getconf _NPROCESSORS_ONLN) > 8 ? 8 : $(getconf _NPROCESSORS_ONLN) ))
echo "CC = $CC, CXX = $CXX, FLB_OPT = $FLB_OPT"
Expand All @@ -64,8 +68,8 @@ jobs:
make -j $nparallel
working-directory: build
env:
CC: ${{ matrix.compiler }}
CXX: ${{ matrix.compiler }}
CC: ${{ matrix.compiler.cc }}
CXX: ${{ matrix.compiler.cxx }}
FLB_OPT: ${{ matrix.flb_option }}
GLOBAL_OPTS: "-DFLB_BACKTRACE=Off -DFLB_SHARED_LIB=Off -DFLB_DEBUG=On -DFLB_ALL=On -DFLB_EXAMPLES=Off"

Expand Down
29 changes: 18 additions & 11 deletions .github/workflows/unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,17 @@ jobs:
- "-DFLB_SANITIZE_MEMORY=On"
- "-DFLB_SANITIZE_THREAD=On"
compiler:
- gcc
- clang
- gcc:
cc: gcc
cxx: g++
- clang:
cc: clang
cxx: clang++
exclude:
- flb_option: "-DFLB_COVERAGE=On"
compiler: clang
compiler:
cc: clang
cxx: clang++
permissions:
contents: read
steps:
Expand All @@ -64,7 +70,7 @@ jobs:
repository: calyptia/fluent-bit-ci
path: ci

- name: ${{ matrix.compiler }} - ${{ matrix.flb_option }}
- name: ${{ matrix.compiler.cc }} & ${{ matrix.compiler.cxx }} - ${{ matrix.flb_option }}
run: |
echo "CC = $CC, CXX = $CXX, FLB_OPT = $FLB_OPT"
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 90
Expand All @@ -73,8 +79,8 @@ jobs:
sudo usermod -a -G systemd-journal $(id -un)
sudo -E su -p $(id -un) -c "PATH=$PATH ci/scripts/run-unit-tests.sh"
env:
CC: ${{ matrix.compiler }}
CXX: ${{ matrix.compiler }}
CC: ${{ matrix.compiler.cc }}
CXX: ${{ matrix.compiler.cxx }}
FLB_OPT: ${{ matrix.flb_option }}

run-macos-unit-tests:
Expand Down Expand Up @@ -128,7 +134,8 @@ jobs:
omit_option: "-DFLB_WITHOUT_flb-it-utils=1 -DFLB_WITHOUT_flb-it-pack=1"
global_option: "-DFLB_BACKTRACE=Off -DFLB_SHARED_LIB=Off -DFLB_DEBUG=On -DFLB_ALL=On -DFLB_EXAMPLES=Off"
unit_test_option: "-DFLB_TESTS_INTERNAL=On"
compiler: gcc
compiler_cc: gcc
compiler_cxx: g++
steps:
- name: Checkout Fluent Bit code
uses: actions/checkout@v4
Expand Down Expand Up @@ -156,15 +163,15 @@ jobs:
export FLB_UNIT_TEST_OPTION="${{ matrix.config.unit_test_option }}"
export FLB_OPT="${FLB_OPTION} ${GLOBAL_OPTION} ${FLB_UNIT_TEST_OPTION} ${FLB_OMIT_OPTION}"

echo "CC = ${{ matrix.config.compiler }}, CXX = ${{ matrix.config.compiler }}, FLB_OPT = $FLB_OPT"
echo "CC = ${{ matrix.config.compiler_cc }}, CXX = ${{ matrix.config.compiler_cxx }}, FLB_OPT = $FLB_OPT"

cmake ${FLB_OPT} ../
make -j $nparallel
ctest -j $nparallel --build-run-dir . --output-on-failure
working-directory: build
env:
CC: ${{ matrix.config.compiler }}
CXX: ${{ matrix.config.compiler }}
CC: ${{ matrix.config.compiler_cc }}
CXX: ${{ matrix.config.compiler_cxx }}

run-qemu-ubuntu-unit-tests:
# We chain this after Linux one as there are CPU time costs for QEMU emulation
Expand Down Expand Up @@ -208,7 +215,7 @@ jobs:
export FLB_UNIT_TEST_OPTION="-DFLB_TESTS_INTERNAL=On"
export FLB_OPT="${FLB_OPTION} ${GLOBAL_OPTION} ${FLB_UNIT_TEST_OPTION} ${FLB_OMIT_OPTION}"
export CC=gcc
export CXX=gcc
export CXX=g++

echo "CC = $CC, CXX = $CXX, FLB_OPT = $FLB_OPT"

Expand Down
12 changes: 12 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ option(FLB_COVERAGE "Build with code-coverage" No)
option(FLB_JEMALLOC "Build with Jemalloc support" No)
option(FLB_REGEX "Build with Regex support" Yes)
option(FLB_UTF8_ENCODER "Build with UTF8 encoding support" Yes)
option(FLB_UNICODE_ENCODER "Build with Unicode (UTF-16LE, UTF-16BE) encoding support" Yes)
option(FLB_PARSER "Build with Parser support" Yes)
option(FLB_TLS "Build with SSL/TLS support" Yes)
option(FLB_BINARY "Build executable binary" Yes)
Expand Down Expand Up @@ -333,6 +334,9 @@ endif()

# Code coverage: build unoptimized, with debug info and gcov instrumentation.
if(FLB_COVERAGE)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 --coverage -fprofile-arcs -ftest-coverage")
  if(FLB_UNICODE_ENCODER)
    # The Unicode encoder adds C++ sources to the build. Extend the existing
    # C++ flags (not the C flags) so that user-supplied CMAKE_CXX_FLAGS are
    # preserved, C-only options do not leak into C++ compiles, and the
    # coverage options are not appended twice.
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 --coverage -fprofile-arcs -ftest-coverage")
  endif()
  set(CMAKE_BUILD_TYPE "Debug")
endif()

Expand Down Expand Up @@ -474,6 +478,14 @@ if(FLB_UTF8_ENCODER)
add_subdirectory(${FLB_PATH_LIB_TUTF8E} EXCLUDE_FROM_ALL)
endif()

# simdutf
# Optional Unicode (UTF-16LE/UTF-16BE) encoding support, backed by the bundled
# simdutf library, which is built from C++ sources.
if(FLB_UNICODE_ENCODER)
# Enable the C++ language only when this feature is requested.
enable_language(CXX)
# Request C++11 for C++ targets created after this point.
set (CMAKE_CXX_STANDARD 11)
# EXCLUDE_FROM_ALL: targets in the subdirectory are built only when another
# target depends on them.
add_subdirectory(${FLB_PATH_LIB_SIMDUTF} EXCLUDE_FROM_ALL)
# NOTE(review): FLB_DEFINITION is defined elsewhere; presumably it exposes
# FLB_HAVE_UNICODE_ENCODER as a compile definition - confirm.
FLB_DEFINITION(FLB_HAVE_UNICODE_ENCODER)
endif()

# snappy
add_subdirectory(${FLB_PATH_LIB_SNAPPY} EXCLUDE_FROM_ALL)

Expand Down
1 change: 1 addition & 0 deletions cmake/libraries.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,5 @@ set(FLB_PATH_LIB_CARES "lib/c-ares-1.33.1")
set(FLB_PATH_LIB_SNAPPY "lib/snappy-fef67ac")
set(FLB_PATH_LIB_RDKAFKA "lib/librdkafka-2.4.0")
set(FLB_PATH_LIB_RING_BUFFER "lib/lwrb")
set(FLB_PATH_LIB_SIMDUTF "lib/simdutf-amalgamation-5.5.0")
set(FLB_PATH_LIB_WASM_MICRO_RUNTIME "lib/wasm-micro-runtime-WAMR-1.3.0")
3 changes: 2 additions & 1 deletion dockerfiles/Dockerfile.centos7
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ RUN cmake3 -DCMAKE_INSTALL_PREFIX=/opt/fluent-bit/ -DCMAKE_INSTALL_SYSCONFDIR=/e
-DFLB_OUT_KAFKA=On \
-DFLB_JEMALLOC=On \
-DFLB_CHUNK_TRACE=On \
-DFLB_OUT_PGSQL=On ../
-DFLB_OUT_PGSQL=On \
-DFLB_UNICODE_ENCODER=Off ../

RUN make -j "$(getconf _NPROCESSORS_ONLN)"
9 changes: 9 additions & 0 deletions include/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@ install(FILES ${headers}
COMPONENT headers
PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ)

# simdutf
if(FLB_UNICODE_ENCODER)
file(GLOB headers "fluent-bit/simdutf/*.h")
install(FILES ${headers}
DESTINATION ${FLB_INSTALL_INCLUDEDIR}/fluent-bit/simdutf/
COMPONENT headers
PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ)
endif()

install(FILES "../lib/monkey/include/monkey/mk_core.h"
DESTINATION ${FLB_INSTALL_INCLUDEDIR}/monkey/
COMPONENT headers-extra
Expand Down
91 changes: 91 additions & 0 deletions include/fluent-bit/simdutf/flb_simdutf_connector.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

/* Fluent Bit
* ==========
* Copyright (C) 2015-2024 The Fluent Bit Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef FLB_SIMDUTF_CONNECTOR_H
#define FLB_SIMDUTF_CONNECTOR_H

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/*
 * CHAR16_T: the 16-bit code unit type used by the UTF-16 functions declared
 * in this header.
 *
 * C11 specifies char16_t (from <uchar.h>) as the same type as uint_least16_t,
 * i.e. an unsigned type. The __APPLE__ branch does not include <uchar.h>
 * (presumably because it is unavailable there - confirm), so it must use the
 * unsigned least-16 type as well; a signed fallback would make code units
 * >= 0x8000 negative on that platform only.
 */
#ifdef __APPLE__
#include <stdint.h>
#include <stddef.h>
typedef uint_least16_t CHAR16_T;
#else
#include <uchar.h>
typedef char16_t CHAR16_T;
#endif

#define FLB_SIMDUTF_CONNECTOR_CONVERT_OK 0
#define FLB_SIMDUTF_CONNECTOR_CONVERT_NOP -1
#define FLB_SIMDUTF_CONNECTOR_CONVERT_UNSUPPORTED -2
#define FLB_SIMDUTF_CONNECTOR_CONVERT_ERROR -3

/*
 * Encoding identifiers, copied from the amalgamated simdutf.h so that C code
 * can use them without the C++ namespace.
 *
 * The concrete encodings use distinct power-of-two values (1, 2, 4, 8, 16,
 * 32). UNSPECIFIED is 0 and UNICODE_AUTO is a separate bit (1 << 10) that
 * does not collide with any of the encoding values.
 */
enum flb_simdutf_encoding_type {
FLB_SIMDUTF_ENCODING_TYPE_UTF8 = 1, /* BOM 0xef 0xbb 0xbf */
FLB_SIMDUTF_ENCODING_TYPE_UTF16_LE = 2, /* BOM 0xff 0xfe */
FLB_SIMDUTF_ENCODING_TYPE_UTF16_BE = 4, /* BOM 0xfe 0xff */
FLB_SIMDUTF_ENCODING_TYPE_UTF32_LE = 8, /* BOM 0xff 0xfe 0x00 0x00 */
FLB_SIMDUTF_ENCODING_TYPE_UTF32_BE = 16, /* BOM 0x00 0x00 0xfe 0xff */
FLB_SIMDUTF_ENCODING_TYPE_Latin1 = 32, /* no BOM listed */

FLB_SIMDUTF_ENCODING_TYPE_UNSPECIFIED = 0, /* no encoding selected */
FLB_SIMDUTF_ENCODING_TYPE_UNICODE_AUTO = 1 << 10, /* Flag: detect the encoding automatically */
};

/*
 * Error codes for the simdutf connector.
 *
 * SUCCESS is pinned to FLB_SIMDUTF_CONNECTOR_CONVERT_OK (0); every following
 * enumerator takes the next consecutive value (HEADER_BITS = 1 ... OTHER = 10).
 * NOTE(review): the order presumably mirrors the error_code enum of the
 * bundled simdutf version - verify before reordering or inserting entries.
 */
enum flb_simdutf_error_code {
FLB_SIMDUTF_ERROR_CODE_SUCCESS = FLB_SIMDUTF_CONNECTOR_CONVERT_OK,
FLB_SIMDUTF_ERROR_CODE_HEADER_BITS,
FLB_SIMDUTF_ERROR_CODE_TOO_SHORT,
FLB_SIMDUTF_ERROR_CODE_TOO_LONG,
FLB_SIMDUTF_ERROR_CODE_OVERLONG,
FLB_SIMDUTF_ERROR_CODE_TOO_LARGE,
FLB_SIMDUTF_ERROR_CODE_SURROGATE,
FLB_SIMDUTF_ERROR_CODE_INVALID_BASE64_CHARACTER,
FLB_SIMDUTF_ERROR_CODE_BASE64_INPUT_REMAINDER,
FLB_SIMDUTF_ERROR_CODE_OUTPUT_BUFFER_TOO_SMALL,
FLB_SIMDUTF_ERROR_CODE_OTHER,
};

int flb_simdutf_connector_utf8_length_from_utf16le(const CHAR16_T *buf, size_t len);
int flb_simdutf_connector_utf8_length_from_utf16be(const CHAR16_T *buf, size_t len);
int flb_simdutf_connector_utf8_length_from_utf16(const CHAR16_T *buf, size_t len);
int flb_simdutf_connector_validate_utf8(const char *buf, size_t len);
int flb_simdutf_connector_validate_utf16le(const CHAR16_T *buf, size_t len);
int flb_simdutf_connector_validate_utf16be(const CHAR16_T *buf, size_t len);
int flb_simdutf_connector_validate_utf16(const CHAR16_T *buf, size_t len);
int flb_simdutf_connector_convert_utf16le_to_utf8(const CHAR16_T *buf, size_t len,
char **utf8_output, size_t *out_size);
int flb_simdutf_connector_convert_utf16be_to_utf8(const CHAR16_T *buf, size_t len,
char **utf8_output, size_t *out_size);
int flb_simdutf_connector_convert_utf16_to_utf8(const CHAR16_T *buf, size_t len,
char **utf8_output, size_t *out_size);
void flb_simdutf_connector_change_endianness_utf16(const CHAR16_T *input, size_t length, CHAR16_T *output);
int flb_simdutf_connector_detect_encodings(const char *input, size_t length);
int flb_simdutf_connector_convert_from_unicode(int preferred_encoding,
const char *input, size_t length,
char **output, size_t *out_size);

#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif
5 changes: 5 additions & 0 deletions lib/simdutf-amalgamation-5.5.0/AUTHORS
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Wojciech Muła
Daniel Lemire
Nicolas Boyer
John Keiser
Robert Clausecker
5 changes: 5 additions & 0 deletions lib/simdutf-amalgamation-5.5.0/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
set(src
src/simdutf/simdutf.cpp
)

add_library(simdutf-static STATIC ${src})
Loading
Loading