From 70d66b9dfb8fc01d73b5f6bd890043ad5eeaa1b3 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 10 Sep 2024 17:36:06 +0800 Subject: [PATCH 01/40] dma write Signed-off-by: coldWater --- Cargo.lock | 11 +- src/query/pipeline/transforms/Cargo.toml | 1 + .../pipeline/transforms/src/processors/dma.rs | 325 ++++++++++++++++++ .../pipeline/transforms/src/processors/mod.rs | 2 + .../transforms/src/processors/spiller_disk.rs | 70 ++++ 5 files changed, 404 insertions(+), 5 deletions(-) create mode 100644 src/query/pipeline/transforms/src/processors/dma.rs create mode 100644 src/query/pipeline/transforms/src/processors/spiller_disk.rs diff --git a/Cargo.lock b/Cargo.lock index 3aaa6c612aba..c1f81442a0a1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3996,6 +3996,7 @@ dependencies = [ "databend-common-pipeline-core", "itertools 0.10.5", "jsonb", + "libc", "log", "match-template", "rand 0.8.5", @@ -9625,9 +9626,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.155" +version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "libflate" @@ -9660,7 +9661,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" dependencies = [ "cfg-if 1.0.0", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -15361,8 +15362,8 @@ version = "1.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" dependencies = [ - "cfg-if 0.1.10", - "rand 0.7.3", + "cfg-if 1.0.0", + "rand 0.8.5", "static_assertions", ] diff --git a/src/query/pipeline/transforms/Cargo.toml b/src/query/pipeline/transforms/Cargo.toml index 255812696780..0660bbb611b6 100644 --- a/src/query/pipeline/transforms/Cargo.toml +++ b/src/query/pipeline/transforms/Cargo.toml @@ -18,6 +18,7 @@ databend-common-exception = { workspace = true } databend-common-expression = { workspace = true } databend-common-pipeline-core = { workspace = true } jsonb = { workspace = true } +libc = "0.2.158" log = { workspace = true } match-template = { workspace = true } serde = { workspace = true } diff --git a/src/query/pipeline/transforms/src/processors/dma.rs b/src/query/pipeline/transforms/src/processors/dma.rs new file mode 100644 index 000000000000..ac80a24a91ad --- /dev/null +++ b/src/query/pipeline/transforms/src/processors/dma.rs @@ -0,0 +1,325 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
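+
+//! Aligned buffers and `O_DIRECT` file helpers used by the spill path.
+//!
+//! Direct I/O bypasses the kernel page cache, so every transfer must use a
+//! buffer whose address, length and file offset are multiples of the file's
+//! alignment (the filesystem block size, at least 512 bytes). A sketch of the
+//! alignment helpers, assuming `file` is an open `DmaFile` with a 4096-byte
+//! alignment:
+//!
+//! ```ignore
+//! assert_eq!(file.align_up(5000), 8192); // smallest aligned value >= 5000
+//! assert_eq!(file.align_down(5000), 4096); // largest aligned value <= 5000
+//! ```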
+ +use std::alloc::alloc; +use std::alloc::dealloc; +use std::alloc::Layout; +use std::io; +use std::io::IoSlice; +use std::ops::Deref; +use std::ops::DerefMut; +use std::os::unix::io::AsRawFd; +use std::path::Path; + +use databend_common_base::runtime::spawn_blocking; +use tokio::fs::File; + +/// An aligned buffer used to perform io on a `DmaFile`. +#[derive(Debug)] +pub struct DmaBuffer { + cap: usize, + len: usize, + align: usize, + data: *mut u8, +} + +unsafe impl Send for DmaBuffer {} + +impl DmaBuffer { + /// Allocates an aligned buffer. + pub(crate) fn new(cap: usize, align: usize) -> DmaBuffer { + let layout = Layout::from_size_align(cap, align).unwrap(); + let data = unsafe { alloc(layout) }; + Self { + data, + cap, + align, + len: 0, + } + } + + /// Sets the internal length of the buffer. The caller must ensure that the memory is + /// initialized until `new_len` before calling. + pub unsafe fn set_len(&mut self, new_len: usize) { + debug_assert!(new_len <= self.cap); + self.len = new_len; + } + + /// Returns the number of initialized bytes in the buffer. + pub fn len(&self) -> usize { + self.len + } + + /// Returns the capacity for this `DmaBuffer`. + pub fn capacity(&self) -> usize { + self.cap + } + + /// Returns the remining capacity in the buffer. + pub fn remaining(&self) -> usize { + self.capacity() - self.len() + } + + /// Returns a raw pointer to the buffer's data. + pub fn as_ptr(&self) -> *const u8 { + self.data as *const _ + } + + /// Returns an unsafe mutable pointer to the buffer's data. + pub fn as_mut_ptr(&mut self) -> *mut u8 { + self.data + } + + /// Extends `self` with the content of `other`. + /// Panics if `self` doesn't have enough capacity left to contain `other`. + pub fn extend_from_slice(&mut self, other: &[u8]) { + assert!(other.len() <= self.remaining()); + + let buf = unsafe { std::slice::from_raw_parts_mut(self.data.add(self.len()), other.len()) }; + buf.copy_from_slice(other); + self.len += other.len(); + } +} + +impl Deref for DmaBuffer { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + unsafe { std::slice::from_raw_parts(self.data, self.len()) } + } +} + +impl DerefMut for DmaBuffer { + fn deref_mut(&mut self) -> &mut Self::Target { + unsafe { std::slice::from_raw_parts_mut(self.data, self.len()) } + } +} + +impl Drop for DmaBuffer { + fn drop(&mut self) { + let layout = Layout::from_size_align(self.cap, self.align).unwrap(); + unsafe { dealloc(self.data, layout) } + } +} + +/// A `DmaFile` is similar to a `File`, but it is openened with the `O_DIRECT` file in order to +/// perform direct IO. +pub struct DmaFile { + fd: File, + alignment: usize, + buf: DmaBuffer, +} + +impl DmaFile { + /// Attempts to open a file in read-only mode. + // pub async fn open(path: impl AsRef) -> io::Result { + // let options = OpenOptions::new(); + // const O_DIRECT: i32 = 0x00040000; + + // let file = options.read(true).custom_flags(O_DIRECT).open(path).await?; + + // let statfs = fstatfs(&fd).await?; + // // TODO: the actual aligment may differ from the optimal io size? we should probably get + // // this information from the the device the file lives on. + // let alignment = statfs.f_bsize.max(512) as usize; + // Ok(DmaFile { fd, alignment }); + + // OpenOptions::new().read(true).open_dma(path).await + // } + + /// Opens a file in write-only mode. 
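+    ///
+    /// The file is created with `O_DIRECT` (truncating it if it already
+    /// exists), so writes bypass the page cache and must come from an aligned
+    /// `DmaBuffer`. A usage sketch via the vectored helper defined below
+    /// (hypothetical path, error handling elided):
+    ///
+    /// ```ignore
+    /// let bufs = [IoSlice::new(b"payload")];
+    /// let written = dma_write_file_vectored("/tmp/spill.bin", &bufs).await?;
+    /// ```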
+    pub async fn create(path: impl AsRef<Path>) -> io::Result<DmaFile> {
+        let file = File::options()
+            .write(true)
+            .create(true)
+            .truncate(true)
+            .custom_flags(libc::O_DIRECT)
+            .open(path)
+            .await?;
+
+        open_dma(file).await
+    }
+
+    /// Aligns `value` up to the memory alignement requirement for this file.
+    pub fn align_up(&self, value: usize) -> usize {
+        (value + self.alignment - 1) & !(self.alignment - 1)
+    }
+
+    /// Aligns `value` down to the memory alignement requirement for this file.
+    pub fn align_down(&self, value: usize) -> usize {
+        value & !(self.alignment - 1)
+    }
+
+    /// Return the alignement requirement for this file. The returned alignement value can be used
+    /// to allocate a buffer to use with this file:
+    pub fn alignment(&self) -> usize {
+        self.alignment
+    }
+
+    pub fn buffer(&self) -> &DmaBuffer {
+        &self.buf
+    }
+
+    pub fn mut_buffer(&mut self) -> &mut DmaBuffer {
+        &mut self.buf
+    }
+
+    fn write_direct(&mut self) -> io::Result<usize> {
+        let rt = unsafe {
+            libc::write(
+                self.fd.as_raw_fd(),
+                self.buf.as_ptr().cast(),
+                self.buf.capacity(),
+            )
+        };
+        unsafe { self.buf.set_len(0) }
+        if rt >= 0 {
+            Ok(rt as usize)
+        } else {
+            Err(io::Error::last_os_error())
+        }
+    }
+
+    fn truncate(&self, length: usize) -> io::Result<usize> {
+        let rt = unsafe { libc::ftruncate64(self.fd.as_raw_fd(), length as i64) };
+        if rt >= 0 {
+            Ok(rt as usize)
+        } else {
+            Err(io::Error::last_os_error())
+        }
+    }
+
+    pub async fn close(self) -> io::Result<()> {
+        todo!()
+    }
+}
+
+async fn open_dma(file: File) -> io::Result<DmaFile> {
+    let statfs = fstatfs(&file).await?;
+    // TODO: the actual aligment may differ from the optimal io size? we should probably get
+    // this information from the the device the file lives on.
+    let alignment = statfs.f_bsize.max(512) as usize;
+    let buf = DmaBuffer::new(alignment, alignment);
+    Ok(DmaFile {
+        fd: file,
+        alignment,
+        buf,
+    })
+}
+
+async fn fstatfs(file: &File) -> io::Result<libc::statfs> {
+    let fd = file.as_raw_fd();
+    asyncify(move || {
+        let mut statfs = std::mem::MaybeUninit::<libc::statfs>::uninit();
+        let ret = unsafe { libc::fstatfs(fd, statfs.as_mut_ptr()) };
+        if ret == -1 {
+            return Err(io::Error::last_os_error());
+        }
+
+        Ok(unsafe { statfs.assume_init() })
+    })
+    .await
+}
+
+async fn asyncify<F, T>(f: F) -> io::Result<T>
+where
+    F: FnOnce() -> io::Result<T> + Send + 'static,
+    T: Send + 'static,
+{
+    match spawn_blocking(f).await {
+        Ok(res) => res,
+        Err(_) => Err(io::Error::new(
+            io::ErrorKind::Other,
+            "background task failed",
+        )),
+    }
+}
+
+pub async fn dma_write_file_vectored<'a>(
+    path: impl AsRef<Path>,
+    bufs: &'a [IoSlice<'a>],
+) -> io::Result<usize> {
+    let mut file = DmaFile::create(path.as_ref()).await?;
+
+    for buf in bufs {
+        let mut buf = &buf[..];
+
+        while !buf.is_empty() {
+            if file.buffer().remaining() == 0 {
+                file = asyncify(move || file.write_direct().map(|_| file)).await?;
+            }
+
+            let dst = file.mut_buffer();
+            let remaining = dst.remaining();
+            let n = buf.len().min(remaining);
+            let (left, right) = buf.split_at(n);
+            dst.extend_from_slice(left);
+            buf = right;
+        }
+    }
+
+    let file_length = bufs.iter().map(|buf| buf.len()).sum();
+    let dst = file.mut_buffer();
+    if dst.remaining() > 0 {
+        unsafe { dst.set_len(dst.cap) }
+        file = asyncify(move || file.write_direct().map(|_| file)).await?;
+        asyncify(move || file.truncate(file_length).map(|_| file)).await?;
+    }
+
+    Ok(file_length)
+}
+
+// fn write(&mut self, mut buf: &[u8]) -> io::Result<usize> {
+//     let data_len = buf.len();
+//     if self.n != 0 {
+//         let end = self.n + buf.len();
+//         if end < self.buf.len() {
+//
self.buf[self.n..end].copy_from_slice(buf); +// self.n = end; +// return Ok(buf.len()); +// } else { +// let r = self.buf.len() - self.n; +// self.buf[self.n..].copy_from_slice(&buf[..r]); +// let n = self.write_direct(&self.buf)?; +// assert_eq!(n, self.buf.len()); +// self.n = 0; +// buf = &buf[r..]; +// } +// } +// while buf.len() >= SIZE_OF_BLOCK { +// let r = buf.len() & ALIGN_SIZE_OF_BLOCK; +// let n = self.write_direct(&buf[..r])?; +// buf = &buf[n..]; +// } +// if !buf.is_empty() { +// self.buf[0..buf.len()].copy_from_slice(buf); +// self.n = buf.len(); +// } +// Ok(data_len) +// } + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_xxx() { + let data1 = b"aaa"; + + let bufs = vec![IoSlice::new(data1)]; + let length = dma_write_file_vectored("./test_file", &bufs).await.unwrap(); + + println!("{length}"); + } +} diff --git a/src/query/pipeline/transforms/src/processors/mod.rs b/src/query/pipeline/transforms/src/processors/mod.rs index a9c76cf3f0a3..3edb3119a8b8 100644 --- a/src/query/pipeline/transforms/src/processors/mod.rs +++ b/src/query/pipeline/transforms/src/processors/mod.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod dma; mod transforms; +mod spiller_disk; pub use transforms::*; diff --git a/src/query/pipeline/transforms/src/processors/spiller_disk.rs b/src/query/pipeline/transforms/src/processors/spiller_disk.rs new file mode 100644 index 000000000000..34c7654ce937 --- /dev/null +++ b/src/query/pipeline/transforms/src/processors/spiller_disk.rs @@ -0,0 +1,70 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::io::IoSlice; +use std::path::PathBuf; + +use databend_common_base::base::GlobalUniqName; +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::arrow::serialize_column; +use databend_common_expression::DataBlock; + +pub struct DiskSpiller { + root: PathBuf, + + max_size: usize, + /// Record columns layout for spilled data, will be used when read data from disk + pub columns_layout: HashMap>, +} + +impl DiskSpiller { + /// Write a [`DataBlock`] to storage. 
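+    /// The intended flow (the body below is still a commented-out sketch
+    /// ending in `todo!()`): serialize each column with `serialize_column`,
+    /// record the per-column byte lengths in `columns_layout` so the block
+    /// can be rebuilt on read, then write all column buffers to the returned
+    /// location with one vectored direct-I/O write.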
+ pub async fn spill_block(&mut self, data: DataBlock) -> Result { + let unique_name = GlobalUniqName::unique(); + + let location = self.root.join(unique_name); + + // let file = DmaFile::create(location.as_path()) + // .await + // .map_err(error_from_glommio)?; + // let mut writer = DmaStreamWriterBuilder::new(file).build(); + + // let data = data.convert_to_full(); + // let columns = data.columns(); + + // let location = location.as_os_str().to_str().unwrap().to_string(); + + // let columns_data = columns + // .iter() + // .map(|entry| serialize_column(entry.value.as_column().unwrap())) + // .collect::>(); + // let layouts = columns_data + // .iter() + // .map(|bytes| bytes.len()) + // .collect::>(); + // self.columns_layout.insert(location.clone(), layouts); + // let bufs = columns_data + // .iter() + // .map(|data| IoSlice::new(&data)) + // .collect::>(); + // writer.write_vectored(&bufs).await?; + // writer.close().await?; + + todo!(); + + // Ok(location) + } +} From 5666858c10198ecfedcf2fc228592ed241d5e646 Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 11 Sep 2024 20:52:51 +0800 Subject: [PATCH 02/40] partition spill to disk Signed-off-by: coldWater --- Cargo.lock | 4 +- src/common/cache/Cargo.toml | 3 + src/common/cache/src/disk.rs | 15 + .../cache/src/disk}/dma.rs | 268 +++++++++++++----- src/common/cache/src/lib.rs | 4 + src/query/pipeline/transforms/Cargo.toml | 1 - .../pipeline/transforms/src/processors/mod.rs | 2 - .../transforms/src/processors/spiller_disk.rs | 70 ----- .../src/pipelines/builders/builder_window.rs | 1 + ...transform_window_partition_spill_reader.rs | 169 +++++------ ...transform_window_partition_spill_writer.rs | 160 +++++++---- .../partition_by/window_partition_meta.rs | 9 +- 12 files changed, 423 insertions(+), 283 deletions(-) create mode 100644 src/common/cache/src/disk.rs rename src/{query/pipeline/transforms/src/processors => common/cache/src/disk}/dma.rs (51%) delete mode 100644 src/query/pipeline/transforms/src/processors/spiller_disk.rs diff --git a/Cargo.lock b/Cargo.lock index c1f81442a0a1..c61d7d5276a1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3212,8 +3212,11 @@ dependencies = [ name = "databend-common-cache" version = "0.1.0" dependencies = [ + "databend-common-base", "hashbrown 0.14.5", "hashlink 0.8.4", + "libc", + "tokio", ] [[package]] @@ -3996,7 +3999,6 @@ dependencies = [ "databend-common-pipeline-core", "itertools 0.10.5", "jsonb", - "libc", "log", "match-template", "rand 0.8.5", diff --git a/src/common/cache/Cargo.toml b/src/common/cache/Cargo.toml index cfd7bccd6229..793982f45d8c 100644 --- a/src/common/cache/Cargo.toml +++ b/src/common/cache/Cargo.toml @@ -11,8 +11,11 @@ doctest = false test = true [dependencies] +databend-common-base = { workspace = true } hashbrown = { workspace = true } hashlink = "0.8" +libc = "0.2.158" +tokio = { workspace = true } [dev-dependencies] diff --git a/src/common/cache/src/disk.rs b/src/common/cache/src/disk.rs new file mode 100644 index 000000000000..740c34018d60 --- /dev/null +++ b/src/common/cache/src/disk.rs @@ -0,0 +1,15 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod dma; diff --git a/src/query/pipeline/transforms/src/processors/dma.rs b/src/common/cache/src/disk/dma.rs similarity index 51% rename from src/query/pipeline/transforms/src/processors/dma.rs rename to src/common/cache/src/disk/dma.rs index ac80a24a91ad..78d0b7b71be7 100644 --- a/src/query/pipeline/transforms/src/processors/dma.rs +++ b/src/common/cache/src/disk/dma.rs @@ -17,13 +17,16 @@ use std::alloc::dealloc; use std::alloc::Layout; use std::io; use std::io::IoSlice; +use std::io::SeekFrom; use std::ops::Deref; use std::ops::DerefMut; +use std::ops::Range; use std::os::unix::io::AsRawFd; use std::path::Path; use databend_common_base::runtime::spawn_blocking; use tokio::fs::File; +use tokio::io::AsyncSeekExt; /// An aligned buffer used to perform io on a `DmaFile`. #[derive(Debug)] @@ -38,7 +41,7 @@ unsafe impl Send for DmaBuffer {} impl DmaBuffer { /// Allocates an aligned buffer. - pub(crate) fn new(cap: usize, align: usize) -> DmaBuffer { + fn new(cap: usize, align: usize) -> DmaBuffer { let layout = Layout::from_size_align(cap, align).unwrap(); let data = unsafe { alloc(layout) }; Self { @@ -115,75 +118,71 @@ impl Drop for DmaBuffer { /// A `DmaFile` is similar to a `File`, but it is openened with the `O_DIRECT` file in order to /// perform direct IO. -pub struct DmaFile { +struct DmaFile { fd: File, alignment: usize, - buf: DmaBuffer, + buf: Option, } impl DmaFile { /// Attempts to open a file in read-only mode. - // pub async fn open(path: impl AsRef) -> io::Result { - // let options = OpenOptions::new(); - // const O_DIRECT: i32 = 0x00040000; - - // let file = options.read(true).custom_flags(O_DIRECT).open(path).await?; - - // let statfs = fstatfs(&fd).await?; - // // TODO: the actual aligment may differ from the optimal io size? we should probably get - // // this information from the the device the file lives on. - // let alignment = statfs.f_bsize.max(512) as usize; - // Ok(DmaFile { fd, alignment }); + async fn open(path: impl AsRef) -> io::Result { + let file = File::options() + .read(true) + .custom_flags(libc::O_DIRECT) + .open(path) + .await?; - // OpenOptions::new().read(true).open_dma(path).await - // } + open_dma(file).await + } /// Opens a file in write-only mode. - pub async fn create(path: impl AsRef) -> io::Result { + async fn create(path: impl AsRef) -> io::Result { let file = File::options() .write(true) .create(true) .truncate(true) - .custom_flags(libc::O_DIRECT) + .custom_flags(libc::O_DIRECT | libc::O_EXCL) .open(path) .await?; open_dma(file).await } + fn set_buffer(&mut self, buf: DmaBuffer) { + self.buf = Some(buf) + } + /// Aligns `value` up to the memory alignement requirement for this file. pub fn align_up(&self, value: usize) -> usize { - (value + self.alignment - 1) & !(self.alignment - 1) + align_up(self.alignment, value) } /// Aligns `value` down to the memory alignement requirement for this file. + #[allow(dead_code)] pub fn align_down(&self, value: usize) -> usize { - value & !(self.alignment - 1) + align_down(self.alignment, value) } /// Return the alignement requirement for this file. 
The returned alignement value can be used /// to allocate a buffer to use with this file: + #[allow(dead_code)] pub fn alignment(&self) -> usize { self.alignment } - pub fn buffer(&self) -> &DmaBuffer { - &self.buf + fn buffer(&self) -> &DmaBuffer { + self.buf.as_ref().unwrap() } - pub fn mut_buffer(&mut self) -> &mut DmaBuffer { - &mut self.buf + fn mut_buffer(&mut self) -> &mut DmaBuffer { + self.buf.as_mut().unwrap() } fn write_direct(&mut self) -> io::Result { - let rt = unsafe { - libc::write( - self.fd.as_raw_fd(), - self.buf.as_ptr().cast(), - self.buf.capacity(), - ) - }; - unsafe { self.buf.set_len(0) } + let buf = self.buffer(); + let rt = unsafe { libc::write(self.fd.as_raw_fd(), buf.as_ptr().cast(), buf.len()) }; + unsafe { self.mut_buffer().set_len(0) } if rt >= 0 { Ok(rt as usize) } else { @@ -191,6 +190,19 @@ impl DmaFile { } } + fn read_direct(&mut self) -> io::Result { + let fd = self.fd.as_raw_fd(); + let buf = self.mut_buffer(); + let rt = unsafe { libc::read(fd, buf.as_mut_ptr().cast(), buf.capacity()) }; + if rt >= 0 { + debug_assert_eq!(buf.capacity(), rt as usize); + unsafe { buf.set_len(rt as usize) } + Ok(rt as usize) + } else { + Err(io::Error::last_os_error()) + } + } + fn truncate(&self, length: usize) -> io::Result { let rt = unsafe { libc::ftruncate64(self.fd.as_raw_fd(), length as i64) }; if rt >= 0 { @@ -200,21 +212,29 @@ impl DmaFile { } } - pub async fn close(self) -> io::Result<()> { - todo!() + async fn seek(&mut self, pos: SeekFrom) -> io::Result { + self.fd.seek(pos).await } } +pub fn align_up(alignment: usize, value: usize) -> usize { + (value + alignment - 1) & !(alignment - 1) +} + +pub fn align_down(alignment: usize, value: usize) -> usize { + value & !(alignment - 1) +} + async fn open_dma(file: File) -> io::Result { let statfs = fstatfs(&file).await?; // TODO: the actual aligment may differ from the optimal io size? we should probably get // this information from the the device the file lives on. 
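    // Note: `f_bsize` is the filesystem's preferred I/O block size; 512 is
    // kept as a floor since it is typically the smallest alignment O_DIRECT accepts.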
let alignment = statfs.f_bsize.max(512) as usize; - let buf = DmaBuffer::new(alignment, alignment); + Ok(DmaFile { fd: file, alignment, - buf, + buf: None, }) } @@ -251,6 +271,8 @@ pub async fn dma_write_file_vectored<'a>( bufs: &'a [IoSlice<'a>], ) -> io::Result { let mut file = DmaFile::create(path.as_ref()).await?; + let buf = DmaBuffer::new(file.alignment, file.alignment); + file.set_buffer(buf); for buf in bufs { let mut buf = &buf[..]; @@ -270,56 +292,154 @@ pub async fn dma_write_file_vectored<'a>( } let file_length = bufs.iter().map(|buf| buf.len()).sum(); - let dst = file.mut_buffer(); - if dst.remaining() > 0 { - unsafe { dst.set_len(dst.cap) } - file = asyncify(move || file.write_direct().map(|_| file)).await?; - asyncify(move || file.truncate(file_length).map(|_| file)).await?; + let len = file.buffer().len(); + if len > 0 { + let align_up = file.align_up(len); + if align_up == len { + asyncify(move || file.write_direct().map(|_| file)).await?; + } else { + let dst = file.mut_buffer(); + unsafe { dst.set_len(align_up) } + file = asyncify(move || file.write_direct().map(|_| file)).await?; + asyncify(move || file.truncate(file_length).map(|_| file)).await?; + } } Ok(file_length) } -// fn write(&mut self, mut buf: &[u8]) -> io::Result { -// let data_len = buf.len(); -// if self.n != 0 { -// let end = self.n + buf.len(); -// if end < self.buf.len() { -// self.buf[self.n..end].copy_from_slice(buf); -// self.n = end; -// return Ok(buf.len()); -// } else { -// let r = self.buf.len() - self.n; -// self.buf[self.n..].copy_from_slice(&buf[..r]); -// let n = self.write_direct(&self.buf)?; -// assert_eq!(n, self.buf.len()); -// self.n = 0; -// buf = &buf[r..]; -// } -// } -// while buf.len() >= SIZE_OF_BLOCK { -// let r = buf.len() & ALIGN_SIZE_OF_BLOCK; -// let n = self.write_direct(&buf[..r])?; -// buf = &buf[n..]; -// } -// if !buf.is_empty() { -// self.buf[0..buf.len()].copy_from_slice(buf); -// self.n = buf.len(); -// } -// Ok(data_len) -// } +pub async fn dma_read_file( + path: impl AsRef, + mut writer: impl io::Write, +) -> io::Result { + let mut file = DmaFile::open(path.as_ref()).await?; + let buf = DmaBuffer::new(file.alignment, file.alignment); + file.set_buffer(buf); + + let mut n = 0; + loop { + file = asyncify(move || file.read_direct().map(|_| file)).await?; + + let buf = file.buffer(); + if buf.is_empty() { + return Ok(n); + } + n += buf.len(); + writer.write_all(buf)?; + let eof = buf.remaining() > 0; + unsafe { file.mut_buffer().set_len(0) } + if eof { + return Ok(n); + } + } +} + +pub async fn dma_read_file_range( + path: impl AsRef, + range: Range, +) -> io::Result<(DmaBuffer, Range)> { + if range.is_empty() { + return Ok((DmaBuffer::new(2, 2), 0..0)); + } + + let mut file = DmaFile::open(path.as_ref()).await?; + + let align_start = file.align_down(range.start as usize); + let align_end = file.align_up(range.end as usize); + + let buf = DmaBuffer::new(align_end - align_start, file.alignment); + file.set_buffer(buf); + + let offset = file.seek(SeekFrom::Start(align_start as u64)).await?; + + if offset as usize != align_start { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "range out of range", + )); + } + + file = asyncify(move || file.read_direct().map(|_| file)).await?; + + let rt_range = range.start as usize - align_start..range.end as usize - align_start; + Ok((file.buf.unwrap(), rt_range)) +} #[cfg(test)] mod tests { use super::*; #[tokio::test] - async fn test_xxx() { - let data1 = b"aaa"; + async fn test_read_write() { + let _ = 
std::fs::remove_file("test_file"); + + run_test(0).await.unwrap(); + run_test(100).await.unwrap(); + run_test(200).await.unwrap(); + + run_test(4096 - 1).await.unwrap(); + run_test(4096).await.unwrap(); + run_test(4096 + 1).await.unwrap(); + + run_test(4096 * 2 - 1).await.unwrap(); + run_test(4096 * 2).await.unwrap(); + run_test(4096 * 2 + 1).await.unwrap(); + } - let bufs = vec![IoSlice::new(data1)]; - let length = dma_write_file_vectored("./test_file", &bufs).await.unwrap(); + async fn run_test(n: usize) -> io::Result<()> { + let want = (0..n).map(|i| (i % 256) as u8).collect::>(); - println!("{length}"); + let bufs = vec![IoSlice::new(&want)]; + let length = dma_write_file_vectored("test_file", &bufs).await?; + + assert_eq!(length, want.len()); + + let mut got = Vec::new(); + + let length = dma_read_file("test_file", &mut got).await?; + assert_eq!(length, want.len()); + assert_eq!(got, want); + + std::fs::remove_file("test_file")?; + Ok(()) + } + + #[tokio::test] + async fn test_range_read() { + let _ = std::fs::remove_file("test_file"); + let n: usize = 4096 * 2; + + let want = (0..n).map(|i| (i % 256) as u8).collect::>(); + + let bufs = vec![IoSlice::new(&want)]; + dma_write_file_vectored("test_file", &bufs).await.unwrap(); + + let got = dma_read_file_range("test_file", 0..10).await.unwrap(); + let got = got.0[got.1].to_vec(); + assert_eq!(&want[0..10], got); + + let got = dma_read_file_range("test_file", 10..30).await.unwrap(); + let got = got.0[got.1].to_vec(); + assert_eq!(&want[10..30], got); + + let got = dma_read_file_range("test_file", 4096 - 5..4096 + 5) + .await + .unwrap(); + let got = got.0[got.1].to_vec(); + assert_eq!(&want[4096 - 5..4096 + 5], got); + + let got = dma_read_file_range("test_file", 4096..4096 + 5) + .await + .unwrap(); + let got = got.0[got.1].to_vec(); + assert_eq!(&want[4096..4096 + 5], got); + + let got = dma_read_file_range("test_file", 4096 * 2 - 5..4096 * 2) + .await + .unwrap(); + let got = got.0[got.1].to_vec(); + assert_eq!(&want[4096 * 2 - 5..4096 * 2], got); + + let _ = std::fs::remove_file("test_file"); } } diff --git a/src/common/cache/src/lib.rs b/src/common/cache/src/lib.rs index 78b060e9963d..6f2de81950f0 100644 --- a/src/common/cache/src/lib.rs +++ b/src/common/cache/src/lib.rs @@ -16,9 +16,13 @@ #![allow(clippy::uninlined_format_args)] mod cache; +mod disk; mod mem_sized; pub use cache::lru::LruCache; pub use cache::Cache; +pub use disk::dma::dma_read_file; +pub use disk::dma::dma_read_file_range; +pub use disk::dma::dma_write_file_vectored; pub use hashbrown::hash_map::DefaultHashBuilder; pub use mem_sized::MemSized; diff --git a/src/query/pipeline/transforms/Cargo.toml b/src/query/pipeline/transforms/Cargo.toml index 0660bbb611b6..255812696780 100644 --- a/src/query/pipeline/transforms/Cargo.toml +++ b/src/query/pipeline/transforms/Cargo.toml @@ -18,7 +18,6 @@ databend-common-exception = { workspace = true } databend-common-expression = { workspace = true } databend-common-pipeline-core = { workspace = true } jsonb = { workspace = true } -libc = "0.2.158" log = { workspace = true } match-template = { workspace = true } serde = { workspace = true } diff --git a/src/query/pipeline/transforms/src/processors/mod.rs b/src/query/pipeline/transforms/src/processors/mod.rs index 3edb3119a8b8..a9c76cf3f0a3 100644 --- a/src/query/pipeline/transforms/src/processors/mod.rs +++ b/src/query/pipeline/transforms/src/processors/mod.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-mod dma; mod transforms; -mod spiller_disk; pub use transforms::*; diff --git a/src/query/pipeline/transforms/src/processors/spiller_disk.rs b/src/query/pipeline/transforms/src/processors/spiller_disk.rs deleted file mode 100644 index 34c7654ce937..000000000000 --- a/src/query/pipeline/transforms/src/processors/spiller_disk.rs +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashMap; -use std::io::IoSlice; -use std::path::PathBuf; - -use databend_common_base::base::GlobalUniqName; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::arrow::serialize_column; -use databend_common_expression::DataBlock; - -pub struct DiskSpiller { - root: PathBuf, - - max_size: usize, - /// Record columns layout for spilled data, will be used when read data from disk - pub columns_layout: HashMap>, -} - -impl DiskSpiller { - /// Write a [`DataBlock`] to storage. - pub async fn spill_block(&mut self, data: DataBlock) -> Result { - let unique_name = GlobalUniqName::unique(); - - let location = self.root.join(unique_name); - - // let file = DmaFile::create(location.as_path()) - // .await - // .map_err(error_from_glommio)?; - // let mut writer = DmaStreamWriterBuilder::new(file).build(); - - // let data = data.convert_to_full(); - // let columns = data.columns(); - - // let location = location.as_os_str().to_str().unwrap().to_string(); - - // let columns_data = columns - // .iter() - // .map(|entry| serialize_column(entry.value.as_column().unwrap())) - // .collect::>(); - // let layouts = columns_data - // .iter() - // .map(|bytes| bytes.len()) - // .collect::>(); - // self.columns_layout.insert(location.clone(), layouts); - // let bufs = columns_data - // .iter() - // .map(|data| IoSlice::new(&data)) - // .collect::>(); - // writer.write_vectored(&bufs).await?; - // writer.close().await?; - - todo!(); - - // Ok(location) - } -} diff --git a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index f81b7df0c6e4..0205feb01f95 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -185,6 +185,7 @@ impl PipelineBuilder { input, output, operator.clone(), + None, location_prefix.clone(), ), )) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs index 732a2d94c8ee..1c3ff3ca0fd9 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs @@ -20,6 +20,7 @@ use std::time::Instant; use databend_common_base::runtime::profile::Profile; 
use databend_common_base::runtime::profile::ProfileStatisticsName; +use databend_common_cache::dma_read_file_range; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::arrow::deserialize_column; @@ -31,13 +32,13 @@ use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; use databend_common_pipeline_core::processors::ProcessorPtr; -use itertools::Itertools; use log::info; use opendal::Operator; +use super::BucketSpilledWindowPayload; +use super::Location; use super::WindowPartitionMeta; use super::WindowPayload; -use crate::pipelines::processors::transforms::window::partition_by::BucketSpilledWindowPayload; type DeserializingMeta = (WindowPartitionMeta, VecDeque>); @@ -130,16 +131,12 @@ impl Processor for TransformWindowPartitionSpillReader { let mut new_data = Vec::with_capacity(data.len()); for meta in data { - if matches!(&meta, WindowPartitionMeta::BucketSpilled(_)) { - if let WindowPartitionMeta::BucketSpilled(p) = meta { - let data = read_data.pop_front().unwrap(); - new_data.push(Self::deserialize(p, data)); - } - - continue; + if let WindowPartitionMeta::BucketSpilled(p) = meta { + let data = read_data.pop_front().unwrap(); + new_data.push(Self::deserialize(p, data)); + } else { + new_data.push(meta); } - - new_data.push(meta); } self.deserialized_meta = @@ -159,77 +156,11 @@ impl Processor for TransformWindowPartitionSpillReader { WindowPartitionMeta::Spilling(_) => unreachable!(), WindowPartitionMeta::BucketSpilled(_) => unreachable!(), WindowPartitionMeta::Payload(_) => unreachable!(), - WindowPartitionMeta::Partitioned { data, .. } => { - let mut total_elapsed = Duration::default(); - let log_interval = 100; - let mut processed_count = 0; - - let mut read_data = Vec::with_capacity(data.len()); - for meta in data { - if let WindowPartitionMeta::BucketSpilled(p) = meta { - let location = p.location.clone(); - let operator = self.operator.clone(); - let data_range = p.data_range.clone(); - read_data.push(databend_common_base::runtime::spawn(async move { - let instant = Instant::now(); - let data = operator - .read_with(&location) - .range(data_range) - .await? - .to_vec(); - - // perf - { - Profile::record_usize_profile( - ProfileStatisticsName::SpillReadCount, - 1, - ); - Profile::record_usize_profile( - ProfileStatisticsName::SpillReadBytes, - data.len(), - ); - Profile::record_usize_profile( - ProfileStatisticsName::SpillReadTime, - instant.elapsed().as_millis() as usize, - ); - } - - total_elapsed += instant.elapsed(); - processed_count += 1; - - // log the progress - if processed_count % log_interval == 0 { - info!( - "Read window partition {}/{} spilled buckets, elapsed: {:?}", - processed_count, - data.len(), - total_elapsed - ); - } - - Ok(data) - })); - } - } - - match futures::future::try_join_all(read_data).await { - Err(_) => { - return Err(ErrorCode::TokioError("Cannot join tokio job")); - } - Ok(read_data) => { - let read_data: std::result::Result>, opendal::Error> = - read_data.into_iter().try_collect(); - - self.deserializing_meta = Some((block_meta, read_data?)); - } - }; - - if processed_count > 0 { - info!( - "Read {} window partition spills successfully, total elapsed: {:?}", - processed_count, total_elapsed - ); - } + WindowPartitionMeta::Partitioned { + data: data_meta, .. 
+ } => { + let data = self.load_bytes(data_meta).await?; + self.deserializing_meta = Some((block_meta, data)); } } } @@ -270,4 +201,78 @@ impl TransformWindowPartitionSpillReader { data: DataBlock::new_from_columns(columns), }) } + + async fn load_bytes(&mut self, data_meta: &[WindowPartitionMeta]) -> Result>> { + let mut total_elapsed = Duration::default(); + let log_interval = 100; + let mut processed_count = 0; + + let load_jobs = data_meta + .iter() + .filter_map(|meta| { + if let WindowPartitionMeta::BucketSpilled(p) = meta { + let location = p.location.clone(); + let operator = self.operator.clone(); + let data_range = p.data_range.clone(); + + Some(databend_common_base::runtime::spawn(async move { + let instant = Instant::now(); + + let data = match location { + Location::Storage(path) => { + operator.read_with(&path).range(data_range).await?.to_vec() + } + Location::Disk(path) => { + let (buf, range) = dma_read_file_range(path, data_range).await?; + buf[range].to_vec() + } + }; + + // perf + { + Profile::record_usize_profile(ProfileStatisticsName::SpillReadCount, 1); + Profile::record_usize_profile( + ProfileStatisticsName::SpillReadBytes, + data.len(), + ); + Profile::record_usize_profile( + ProfileStatisticsName::SpillReadTime, + instant.elapsed().as_millis() as usize, + ); + } + + total_elapsed += instant.elapsed(); + processed_count += 1; + + // log the progress + if processed_count % log_interval == 0 { + info!( + "Read window partition {}/{} spilled buckets, elapsed: {:?}", + processed_count, + data.len(), + total_elapsed, + ); + } + Ok::<_, ErrorCode>(data) + })) + } else { + None + } + }) + .collect::>(); + + let data = match futures::future::try_join_all(load_jobs).await { + Err(_) => { + return Err(ErrorCode::TokioError("Cannot join tokio job")); + } + Ok(data) => data.into_iter().collect::>()?, + }; + + if processed_count > 0 { + info!( + "Read {processed_count} window partition spills successfully, total elapsed: {total_elapsed:?}", + ); + } + Ok(data) + } } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs index 2eb678da77f8..a58f8953cdc3 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs @@ -13,6 +13,9 @@ // limitations under the License. 
use std::any::Any; +use std::io; +use std::path::Path; +use std::path::PathBuf; use std::sync::Arc; use std::time::Instant; @@ -20,6 +23,7 @@ use databend_common_base::base::GlobalUniqName; use databend_common_base::base::ProgressValues; use databend_common_base::runtime::profile::Profile; use databend_common_base::runtime::profile::ProfileStatisticsName; +use databend_common_cache::dma_write_file_vectored; use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; @@ -36,6 +40,7 @@ use opendal::Operator; use super::convert_to_partitions; use super::BucketSpilledWindowPayload; +use super::Location; use super::WindowPartitionMeta; use crate::pipelines::processors::transforms::window::partition_by::SpillingWindowPayloads; use crate::pipelines::processors::transforms::window::partition_by::PARTITION_COUNT; @@ -48,6 +53,7 @@ pub struct TransformWindowPartitionSpillWriter { operator: Operator, location_prefix: String, + disk: Option, spilled_block: Option, spilling_meta: Option, spilling_future: Option>>, @@ -59,6 +65,7 @@ impl TransformWindowPartitionSpillWriter { input: Arc, output: Arc, operator: Operator, + disk: Option, location_prefix: String, ) -> Box { Box::new(TransformWindowPartitionSpillWriter { @@ -66,6 +73,7 @@ impl TransformWindowPartitionSpillWriter { input, output, operator, + disk, location_prefix, spilled_block: None, spilling_meta: None, @@ -74,6 +82,11 @@ impl TransformWindowPartitionSpillWriter { } } +pub struct DiskConfig { + pub root: PathBuf, + pub bytes_limit: usize, +} + #[async_trait::async_trait] impl Processor for TransformWindowPartitionSpillWriter { fn name(&self) -> String { @@ -148,6 +161,8 @@ impl Processor for TransformWindowPartitionSpillWriter { self.ctx.clone(), self.operator.clone(), &self.location_prefix, + self.disk.as_mut(), + GlobalUniqName::unique(), payload, )?); @@ -179,66 +194,80 @@ pub fn spilling_window_payload( ctx: Arc, operator: Operator, location_prefix: &str, + disk: Option<&mut DiskConfig>, + unique_name: String, payload: SpillingWindowPayloads, ) -> Result>> { let partitions = convert_to_partitions(payload.data)?; - let unique_name = GlobalUniqName::unique(); - let location = format!("{}/{}", location_prefix, unique_name); - - let mut write_size = 0; - let mut write_data = Vec::with_capacity(PARTITION_COUNT); - let mut spilled_buckets_payloads = Vec::with_capacity(PARTITION_COUNT); let mut rows = 0; + let mut write_size: u64 = 0; + let mut write_data = Vec::with_capacity(PARTITION_COUNT); - for (bucket, block) in partitions.into_iter() { - if block.is_empty() { - continue; - } + let partitions = partitions + .into_iter() + .filter_map(|(bucket, block)| { + if block.is_empty() { + return None; + } + rows += block.num_rows(); + + let columns_data = block + .columns() + .iter() + .map(|entry| { + let column = entry + .value + .convert_to_full_column(&entry.data_type, block.num_rows()); + serialize_column(&column) + }) + .collect::>(); + + let columns_layout = columns_data + .iter() + .map(|data| data.len() as u64) + .collect::>(); + + write_data.push(columns_data); + + let begin = write_size; + write_size += columns_layout.iter().sum::(); + + Some((bucket, columns_layout, begin..write_size)) + }) + .collect::>(); - rows += block.num_rows(); - - let begin = write_size; - let columns = block.columns().to_vec(); - let mut columns_data = Vec::with_capacity(columns.len()); - let mut columns_layout = Vec::with_capacity(columns.len()); - for column in columns.into_iter() 
{ - let column = column - .value - .convert_to_full_column(&column.data_type, block.num_rows()); - let column_data = serialize_column(&column); - write_size += column_data.len() as u64; - columns_layout.push(column_data.len() as u64); - columns_data.push(column_data); + let location = match disk { + Some(disk) if disk.bytes_limit as u64 >= write_size => { + disk.bytes_limit -= write_size as usize; + Location::Disk(disk.root.join(unique_name)) } - - write_data.push(columns_data); - spilled_buckets_payloads.push(BucketSpilledWindowPayload { - bucket, - location: location.clone(), - data_range: begin..write_size, - columns_layout, - }); - } - - Ok(Box::pin(async move { + _ => Location::Storage(format!("{location_prefix}/{unique_name}")), + }; + + let spilled_buckets_payloads = partitions + .into_iter() + .map( + |(bucket, columns_layout, data_range)| BucketSpilledWindowPayload { + bucket, + location: location.clone(), + data_range, + columns_layout, + }, + ) + .collect::>(); + + let future = Box::pin(async move { let instant = Instant::now(); - let mut write_bytes = 0; - if !write_data.is_empty() { - let mut writer = operator - .writer_with(&location) - .chunk(8 * 1024 * 1024) - .await?; - for write_bucket_data in write_data.into_iter() { - for data in write_bucket_data.into_iter() { - write_bytes += data.len(); - writer.write(data).await?; - } + let write_bytes = if write_data.is_empty() { + 0 + } else { + match &location { + Location::Storage(path) => write_to_storage(&operator, path, write_data).await?, + Location::Disk(path) => write_to_disk(path, write_data).await?, } - - writer.close().await?; - } + }; // perf { @@ -260,13 +289,40 @@ pub fn spilling_window_payload( } info!( - "Write window partition spill {} successfully, elapsed: {:?}", - location, + "Write window partition spill {location:?} successfully, elapsed: {:?}", instant.elapsed() ); Ok(DataBlock::empty_with_meta( WindowPartitionMeta::create_spilled(spilled_buckets_payloads), )) - })) + }); + Ok(future) +} + +async fn write_to_storage( + operator: &Operator, + path: &str, + write_data: Vec>>, +) -> Result { + let mut writer = operator.writer_with(path).chunk(8 * 1024 * 1024).await?; + + let mut writen = 0; + for data in write_data.into_iter().flatten() { + writen += data.len(); + writer.write(data).await?; + } + + writer.close().await?; + Ok(writen) +} + +async fn write_to_disk(path: impl AsRef, write_data: Vec>>) -> io::Result { + let bufs = write_data + .iter() + .flatten() + .map(|data| io::IoSlice::new(data)) + .collect::>(); + + dma_write_file_vectored(path, &bufs).await } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs index 8dfe0d9e47a2..0cbd35f9d63d 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs @@ -16,6 +16,7 @@ use std::collections::BTreeMap; use std::fmt::Debug; use std::fmt::Formatter; use std::ops::Range; +use std::path::PathBuf; use databend_common_expression::BlockMetaInfo; use databend_common_expression::BlockMetaInfoPtr; @@ -32,11 +33,17 @@ pub struct SpillingWindowPayloads { pub struct BucketSpilledWindowPayload { pub bucket: isize, - pub location: String, + pub location: Location, pub data_range: Range, pub columns_layout: Vec, } +#[derive(Debug, Clone)] +pub enum Location { + 
Storage(String), + Disk(PathBuf), +} + pub enum WindowPartitionMeta { Spilling(SpillingWindowPayloads), Spilled(Vec), From c3432de5f35c4cd8a249368aa391091fecaca0f3 Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 11 Sep 2024 21:06:46 +0800 Subject: [PATCH 03/40] fix Signed-off-by: coldWater --- src/common/cache/src/disk/dma.rs | 12 ++++++------ .../transform_window_partition_spill_writer.rs | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/common/cache/src/disk/dma.rs b/src/common/cache/src/disk/dma.rs index 78d0b7b71be7..6a3d788de153 100644 --- a/src/common/cache/src/disk/dma.rs +++ b/src/common/cache/src/disk/dma.rs @@ -69,7 +69,7 @@ impl DmaBuffer { self.cap } - /// Returns the remining capacity in the buffer. + /// Returns the remaining capacity in the buffer. pub fn remaining(&self) -> usize { self.capacity() - self.len() } @@ -116,7 +116,7 @@ impl Drop for DmaBuffer { } } -/// A `DmaFile` is similar to a `File`, but it is openened with the `O_DIRECT` file in order to +/// A `DmaFile` is similar to a `File`, but it is opened with the `O_DIRECT` file in order to /// perform direct IO. struct DmaFile { fd: File, @@ -153,18 +153,18 @@ impl DmaFile { self.buf = Some(buf) } - /// Aligns `value` up to the memory alignement requirement for this file. + /// Aligns `value` up to the memory alignment requirement for this file. pub fn align_up(&self, value: usize) -> usize { align_up(self.alignment, value) } - /// Aligns `value` down to the memory alignement requirement for this file. + /// Aligns `value` down to the memory alignment requirement for this file. #[allow(dead_code)] pub fn align_down(&self, value: usize) -> usize { align_down(self.alignment, value) } - /// Return the alignement requirement for this file. The returned alignement value can be used + /// Return the alignment requirement for this file. The returned alignment value can be used /// to allocate a buffer to use with this file: #[allow(dead_code)] pub fn alignment(&self) -> usize { @@ -227,7 +227,7 @@ pub fn align_down(alignment: usize, value: usize) -> usize { async fn open_dma(file: File) -> io::Result { let statfs = fstatfs(&file).await?; - // TODO: the actual aligment may differ from the optimal io size? we should probably get + // TODO: the actual alignment may differ from the optimal io size? we should probably get // this information from the the device the file lives on. 
let alignment = statfs.f_bsize.max(512) as usize; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs index a58f8953cdc3..863f292e2318 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs @@ -307,14 +307,14 @@ async fn write_to_storage( ) -> Result { let mut writer = operator.writer_with(path).chunk(8 * 1024 * 1024).await?; - let mut writen = 0; + let mut written = 0; for data in write_data.into_iter().flatten() { - writen += data.len(); + written += data.len(); writer.write(data).await?; } writer.close().await?; - Ok(writen) + Ok(written) } async fn write_to_disk(path: impl AsRef, write_data: Vec>>) -> io::Result { From ebe64bce47d4417cef3992b74d59252805b1225a Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 12 Sep 2024 00:07:11 +0800 Subject: [PATCH 04/40] refine Signed-off-by: coldWater --- src/common/cache/src/disk/dma.rs | 27 ++-- .../transform_window_partition_sort.rs | 10 +- ...transform_window_partition_spill_reader.rs | 143 ++++++++---------- .../partition_by/window_partition_meta.rs | 13 +- 4 files changed, 87 insertions(+), 106 deletions(-) diff --git a/src/common/cache/src/disk/dma.rs b/src/common/cache/src/disk/dma.rs index 6a3d788de153..9ce0744cd829 100644 --- a/src/common/cache/src/disk/dma.rs +++ b/src/common/cache/src/disk/dma.rs @@ -195,7 +195,6 @@ impl DmaFile { let buf = self.mut_buffer(); let rt = unsafe { libc::read(fd, buf.as_mut_ptr().cast(), buf.capacity()) }; if rt >= 0 { - debug_assert_eq!(buf.capacity(), rt as usize); unsafe { buf.set_len(rt as usize) } Ok(rt as usize) } else { @@ -387,59 +386,59 @@ mod tests { } async fn run_test(n: usize) -> io::Result<()> { + let filename = "test_file"; let want = (0..n).map(|i| (i % 256) as u8).collect::>(); let bufs = vec![IoSlice::new(&want)]; - let length = dma_write_file_vectored("test_file", &bufs).await?; + let length = dma_write_file_vectored(filename, &bufs).await?; assert_eq!(length, want.len()); let mut got = Vec::new(); - let length = dma_read_file("test_file", &mut got).await?; + let length = dma_read_file(filename, &mut got).await?; assert_eq!(length, want.len()); assert_eq!(got, want); - std::fs::remove_file("test_file")?; + std::fs::remove_file(filename)?; Ok(()) } #[tokio::test] async fn test_range_read() { - let _ = std::fs::remove_file("test_file"); + let filename = "test_file2"; + let _ = std::fs::remove_file(filename); let n: usize = 4096 * 2; let want = (0..n).map(|i| (i % 256) as u8).collect::>(); let bufs = vec![IoSlice::new(&want)]; - dma_write_file_vectored("test_file", &bufs).await.unwrap(); + dma_write_file_vectored(filename, &bufs).await.unwrap(); - let got = dma_read_file_range("test_file", 0..10).await.unwrap(); + let got = dma_read_file_range(filename, 0..10).await.unwrap(); let got = got.0[got.1].to_vec(); assert_eq!(&want[0..10], got); - let got = dma_read_file_range("test_file", 10..30).await.unwrap(); + let got = dma_read_file_range(filename, 10..30).await.unwrap(); let got = got.0[got.1].to_vec(); assert_eq!(&want[10..30], got); - let got = dma_read_file_range("test_file", 4096 - 5..4096 + 5) + let got = dma_read_file_range(filename, 4096 - 5..4096 + 5) .await .unwrap(); let got = 
got.0[got.1].to_vec(); assert_eq!(&want[4096 - 5..4096 + 5], got); - let got = dma_read_file_range("test_file", 4096..4096 + 5) - .await - .unwrap(); + let got = dma_read_file_range(filename, 4096..4096 + 5).await.unwrap(); let got = got.0[got.1].to_vec(); assert_eq!(&want[4096..4096 + 5], got); - let got = dma_read_file_range("test_file", 4096 * 2 - 5..4096 * 2) + let got = dma_read_file_range(filename, 4096 * 2 - 5..4096 * 2) .await .unwrap(); let got = got.0[got.1].to_vec(); assert_eq!(&want[4096 * 2 - 5..4096 * 2], got); - let _ = std::fs::remove_file("test_file"); + let _ = std::fs::remove_file(filename); } } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_sort.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_sort.rs index 6f27c2da3545..b44b30b2ec44 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_sort.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_sort.rs @@ -26,7 +26,8 @@ use databend_common_pipeline_transforms::processors::sort_merge; use databend_common_pipeline_transforms::processors::BlockMetaTransform; use databend_common_pipeline_transforms::processors::BlockMetaTransformer; -use crate::pipelines::processors::transforms::WindowPartitionMeta; +use super::Partitioned; +use super::WindowPartitionMeta; pub struct TransformWindowPartitionSort { sort_desc: Vec, @@ -67,19 +68,16 @@ impl BlockMetaTransform for TransformWindowPartitionSort { const NAME: &'static str = "TransformWindowPartitionSort"; fn transform(&mut self, meta: WindowPartitionMeta) -> Result> { - if let WindowPartitionMeta::Partitioned { bucket, data } = meta { + if let WindowPartitionMeta::Partitioned(Partitioned { bucket, data }) = meta { let mut sort_blocks = Vec::with_capacity(data.len()); for bucket_data in data { match bucket_data { - WindowPartitionMeta::Spilled(_) => unreachable!(), - WindowPartitionMeta::BucketSpilled(_) => unreachable!(), - WindowPartitionMeta::Partitioned { .. 
} => unreachable!(), - WindowPartitionMeta::Spilling(_) => unreachable!(), WindowPartitionMeta::Payload(p) => { debug_assert!(bucket == p.bucket); let sort_block = DataBlock::sort(&p.data, &self.sort_desc, None)?; sort_blocks.push(sort_block); } + _ => unreachable!(), } } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs index 1c3ff3ca0fd9..544a3b7e1f2b 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs @@ -37,19 +37,17 @@ use opendal::Operator; use super::BucketSpilledWindowPayload; use super::Location; +use super::Partitioned; use super::WindowPartitionMeta; use super::WindowPayload; -type DeserializingMeta = (WindowPartitionMeta, VecDeque>); - pub struct TransformWindowPartitionSpillReader { input: Arc, output: Arc, operator: Operator, deserialized_meta: Option, - reading_meta: Option, - deserializing_meta: Option, + reading_meta: Option, } #[async_trait::async_trait] @@ -79,11 +77,6 @@ impl Processor for TransformWindowPartitionSpillReader { return Ok(Event::NeedConsume); } - if self.deserializing_meta.is_some() { - self.input.set_not_need_data(); - return Ok(Event::Sync); - } - if self.reading_meta.is_some() { self.input.set_not_need_data(); return Ok(Event::Async); @@ -92,7 +85,7 @@ impl Processor for TransformWindowPartitionSpillReader { if self.input.has_data() { let mut data_block = self.input.pull_data().unwrap()?; - if let Some(WindowPartitionMeta::Partitioned { data, .. }) = data_block + if let Some(WindowPartitionMeta::Partitioned(Partitioned { data, .. 
})) = data_block .get_meta() .and_then(WindowPartitionMeta::downcast_ref_from) { @@ -101,8 +94,12 @@ impl Processor for TransformWindowPartitionSpillReader { .any(|meta| matches!(meta, WindowPartitionMeta::BucketSpilled(_))) { self.input.set_not_need_data(); - let block_meta = data_block.take_meta().unwrap(); - self.reading_meta = WindowPartitionMeta::downcast_from(block_meta); + let meta = WindowPartitionMeta::downcast_from(data_block.take_meta().unwrap()); + if let Some(WindowPartitionMeta::Partitioned(partitioned)) = meta { + self.reading_meta = Some(partitioned) + } else { + unreachable!() + } return Ok(Event::Async); } } @@ -120,50 +117,23 @@ impl Processor for TransformWindowPartitionSpillReader { Ok(Event::NeedData) } - fn process(&mut self) -> Result<()> { - if let Some((meta, mut read_data)) = self.deserializing_meta.take() { - match meta { - WindowPartitionMeta::Spilled(_) => unreachable!(), - WindowPartitionMeta::Spilling(_) => unreachable!(), - WindowPartitionMeta::BucketSpilled(_) => unreachable!(), - WindowPartitionMeta::Payload(_) => unreachable!(), - WindowPartitionMeta::Partitioned { bucket, data } => { - let mut new_data = Vec::with_capacity(data.len()); - - for meta in data { - if let WindowPartitionMeta::BucketSpilled(p) = meta { - let data = read_data.pop_front().unwrap(); - new_data.push(Self::deserialize(p, data)); - } else { - new_data.push(meta); - } - } - - self.deserialized_meta = - Some(WindowPartitionMeta::create_partitioned(bucket, new_data)); - } - } - } - - Ok(()) - } - #[async_backtrace::framed] async fn async_process(&mut self) -> Result<()> { - if let Some(block_meta) = self.reading_meta.take() { - match &block_meta { - WindowPartitionMeta::Spilled(_) => unreachable!(), - WindowPartitionMeta::Spilling(_) => unreachable!(), - WindowPartitionMeta::BucketSpilled(_) => unreachable!(), - WindowPartitionMeta::Payload(_) => unreachable!(), - WindowPartitionMeta::Partitioned { - data: data_meta, .. 
- } => { - let data = self.load_bytes(data_meta).await?; - self.deserializing_meta = Some((block_meta, data)); + let Partitioned { bucket, data } = self.reading_meta.take().unwrap(); + let mut blocks = self.load_blocks(&data).await?; + + let new_data = data + .into_iter() + .map(|meta| { + if let WindowPartitionMeta::BucketSpilled(_) = meta { + let data = blocks.pop_front().unwrap(); + WindowPartitionMeta::Payload(WindowPayload { bucket, data }) + } else { + meta } - } - } + }) + .collect::<_>(); + self.deserialized_meta = Some(WindowPartitionMeta::create_partitioned(bucket, new_data)); Ok(()) } @@ -182,49 +152,43 @@ impl TransformWindowPartitionSpillReader { operator, deserialized_meta: None, reading_meta: None, - deserializing_meta: None, }, ))) } - fn deserialize(payload: BucketSpilledWindowPayload, data: Vec) -> WindowPartitionMeta { - let mut begin = 0; - let mut columns = Vec::with_capacity(payload.columns_layout.len()); - - for column_layout in payload.columns_layout { - columns.push(deserialize_column(&data[begin..begin + column_layout as usize]).unwrap()); - begin += column_layout as usize; - } - - WindowPartitionMeta::Payload(WindowPayload { - bucket: payload.bucket, - data: DataBlock::new_from_columns(columns), - }) - } - - async fn load_bytes(&mut self, data_meta: &[WindowPartitionMeta]) -> Result>> { + async fn load_blocks(&mut self, data: &[WindowPartitionMeta]) -> Result> { let mut total_elapsed = Duration::default(); let log_interval = 100; let mut processed_count = 0; - let load_jobs = data_meta + let jobs = data .iter() .filter_map(|meta| { - if let WindowPartitionMeta::BucketSpilled(p) = meta { - let location = p.location.clone(); + if let WindowPartitionMeta::BucketSpilled(payload) = meta { let operator = self.operator.clone(); - let data_range = p.data_range.clone(); + let BucketSpilledWindowPayload { + location, + data_range, + columns_layout, + .. + } = payload.clone(); Some(databend_common_base::runtime::spawn(async move { let instant = Instant::now(); - let data = match location { + let (block, data_size) = match location { Location::Storage(path) => { - operator.read_with(&path).range(data_range).await?.to_vec() + let data = operator + .read_with(&path) + .range(data_range) + .await? 
+ .to_bytes(); + (deserialize_block(&columns_layout, &data), data.len()) } Location::Disk(path) => { let (buf, range) = dma_read_file_range(path, data_range).await?; - buf[range].to_vec() + let data = &buf[range]; + (deserialize_block(&columns_layout, data), data.len()) } }; @@ -233,7 +197,7 @@ impl TransformWindowPartitionSpillReader { Profile::record_usize_profile(ProfileStatisticsName::SpillReadCount, 1); Profile::record_usize_profile( ProfileStatisticsName::SpillReadBytes, - data.len(), + data_size, ); Profile::record_usize_profile( ProfileStatisticsName::SpillReadTime, @@ -248,12 +212,10 @@ impl TransformWindowPartitionSpillReader { if processed_count % log_interval == 0 { info!( "Read window partition {}/{} spilled buckets, elapsed: {:?}", - processed_count, - data.len(), - total_elapsed, + processed_count, data_size, total_elapsed, ); } - Ok::<_, ErrorCode>(data) + Ok::<_, ErrorCode>(block) })) } else { None @@ -261,7 +223,7 @@ impl TransformWindowPartitionSpillReader { }) .collect::>(); - let data = match futures::future::try_join_all(load_jobs).await { + let blocks = match futures::future::try_join_all(jobs).await { Err(_) => { return Err(ErrorCode::TokioError("Cannot join tokio job")); } @@ -273,6 +235,19 @@ impl TransformWindowPartitionSpillReader { "Read {processed_count} window partition spills successfully, total elapsed: {total_elapsed:?}", ); } - Ok(data) + Ok(blocks) } } + +pub fn deserialize_block(columns_layout: &[u64], mut data: &[u8]) -> DataBlock { + let columns = columns_layout + .iter() + .map(|layout| { + let (cur, remain) = data.split_at(*layout as usize); + data = remain; + deserialize_column(cur).unwrap() + }) + .collect::>(); + + DataBlock::new_from_columns(columns) +} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs index 0cbd35f9d63d..2de287e17425 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs @@ -31,6 +31,7 @@ pub struct SpillingWindowPayloads { pub data: BTreeMap>, } +#[derive(Clone)] pub struct BucketSpilledWindowPayload { pub bucket: isize, pub location: Location, @@ -50,7 +51,12 @@ pub enum WindowPartitionMeta { BucketSpilled(BucketSpilledWindowPayload), Payload(WindowPayload), - Partitioned { bucket: isize, data: Vec }, + Partitioned(Partitioned), +} + +pub struct Partitioned { + pub bucket: isize, + pub data: Vec, } impl WindowPartitionMeta { @@ -73,7 +79,10 @@ impl WindowPartitionMeta { } pub fn create_partitioned(bucket: isize, data: Vec) -> BlockMetaInfoPtr { - Box::new(WindowPartitionMeta::Partitioned { bucket, data }) + Box::new(WindowPartitionMeta::Partitioned(Partitioned { + bucket, + data, + })) } } From 0e0184dd0c856a469b8e2e118c022e2ef8b86f8e Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 12 Sep 2024 13:29:55 +0800 Subject: [PATCH 05/40] temp file Signed-off-by: coldWater --- .../src/pipelines/builders/builder_window.rs | 14 ++++++++- ...transform_window_partition_spill_writer.rs | 15 +++++----- src/query/storages/common/cache/src/lib.rs | 2 ++ .../storages/common/cache/src/manager.rs | 28 ++++++++++++++++- .../storages/common/cache/src/temp_file.rs | 30 +++++++++++++++++++ 5 files changed, 80 insertions(+), 9 deletions(-) create mode 100644 src/query/storages/common/cache/src/temp_file.rs diff --git 
a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index 0205feb01f95..2d6b9594ac96 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -26,7 +26,9 @@ use databend_common_pipeline_core::PipeItem; use databend_common_sql::executor::physical_plans::Window; use databend_common_sql::executor::physical_plans::WindowPartition; use databend_common_storage::DataOperator; +use databend_storages_common_cache::CacheManager; +use crate::pipelines::processors::transforms::DiskSpillConfig; use crate::pipelines::processors::transforms::FrameBound; use crate::pipelines::processors::transforms::TransformWindowPartitionBucket; use crate::pipelines::processors::transforms::TransformWindowPartitionScatter; @@ -176,8 +178,18 @@ impl PipelineBuilder { })?; let operator = DataOperator::instance().operator(); + let location_prefix = query_spill_prefix(self.ctx.get_tenant().tenant_name(), &self.ctx.get_id()); + + let disk_spill = + CacheManager::instance() + .get_temp_dir_config() + .map(|cfg| DiskSpillConfig { + root: cfg.path.join(self.ctx.get_id()), + bytes_limit: 1 << 20, // todo + }); + self.main_pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create( TransformWindowPartitionSpillWriter::create( @@ -185,7 +197,7 @@ impl PipelineBuilder { input, output, operator.clone(), - None, + disk_spill.clone(), location_prefix.clone(), ), )) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs index 863f292e2318..5bc03e647866 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs @@ -53,7 +53,7 @@ pub struct TransformWindowPartitionSpillWriter { operator: Operator, location_prefix: String, - disk: Option, + disk_spill: Option, spilled_block: Option, spilling_meta: Option, spilling_future: Option>>, @@ -65,7 +65,7 @@ impl TransformWindowPartitionSpillWriter { input: Arc, output: Arc, operator: Operator, - disk: Option, + disk_spill: Option, location_prefix: String, ) -> Box { Box::new(TransformWindowPartitionSpillWriter { @@ -73,7 +73,7 @@ impl TransformWindowPartitionSpillWriter { input, output, operator, - disk, + disk_spill, location_prefix, spilled_block: None, spilling_meta: None, @@ -82,7 +82,8 @@ impl TransformWindowPartitionSpillWriter { } } -pub struct DiskConfig { +#[derive(Clone)] +pub struct DiskSpillConfig { pub root: PathBuf, pub bytes_limit: usize, } @@ -161,7 +162,7 @@ impl Processor for TransformWindowPartitionSpillWriter { self.ctx.clone(), self.operator.clone(), &self.location_prefix, - self.disk.as_mut(), + self.disk_spill.as_mut(), GlobalUniqName::unique(), payload, )?); @@ -194,7 +195,7 @@ pub fn spilling_window_payload( ctx: Arc, operator: Operator, location_prefix: &str, - disk: Option<&mut DiskConfig>, + disk_spill: Option<&mut DiskSpillConfig>, unique_name: String, payload: SpillingWindowPayloads, ) -> Result>> { @@ -237,7 +238,7 @@ pub fn spilling_window_payload( }) .collect::>(); - let location = match disk { + let location = match disk_spill { Some(disk) if disk.bytes_limit as u64 >= write_size => { disk.bytes_limit -= write_size as usize; 
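A note on the budget arithmetic here: the writer debits the disk budget before the bytes are actually written, so a payload either fits entirely within the local budget or falls through to object storage (later commits in this series make the budget shared and thread-safe). A standalone sketch of that selection under simplified stand-in types (SpillTarget, DiskBudget, and pick_target are illustrative names, not the patch's API):

use std::path::PathBuf;

enum SpillTarget {
    Disk(PathBuf),
    Remote(String),
}

struct DiskBudget {
    root: PathBuf,
    bytes_left: u64,
}

fn pick_target(disk: Option<&mut DiskBudget>, size: u64, prefix: &str, name: &str) -> SpillTarget {
    match disk {
        // Debit before writing; a payload is either fully budgeted or goes remote.
        Some(b) if b.bytes_left >= size => {
            b.bytes_left -= size;
            SpillTarget::Disk(b.root.join(name))
        }
        // No local budget configured, or not enough of it left.
        _ => SpillTarget::Remote(format!("{prefix}/{name}")),
    }
}

fn main() {
    let mut budget = DiskBudget { root: PathBuf::from("/tmp/spill"), bytes_left: 10 };
    assert!(matches!(pick_target(Some(&mut budget), 8, "p", "a"), SpillTarget::Disk(_)));
    // Only 2 bytes of budget remain, so the next payload overflows to remote storage.
    assert!(matches!(pick_target(Some(&mut budget), 8, "p", "b"), SpillTarget::Remote(_)));
}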
Location::Disk(disk.root.join(unique_name)) diff --git a/src/query/storages/common/cache/src/lib.rs b/src/query/storages/common/cache/src/lib.rs index 0a7378134b71..84c1ba172f0f 100644 --- a/src/query/storages/common/cache/src/lib.rs +++ b/src/query/storages/common/cache/src/lib.rs @@ -20,6 +20,7 @@ mod caches; mod manager; mod providers; mod read; +mod temp_file; pub use cache::CacheAccessor; pub use cache::Unit; @@ -45,3 +46,4 @@ pub use read::InMemoryCacheReader; pub use read::InMemoryItemCacheReader; pub use read::LoadParams; pub use read::Loader; +pub use temp_file::TempDir; diff --git a/src/query/storages/common/cache/src/manager.rs b/src/query/storages/common/cache/src/manager.rs index 6553f64f30fd..d56345bca1c8 100644 --- a/src/query/storages/common/cache/src/manager.rs +++ b/src/query/storages/common/cache/src/manager.rs @@ -37,6 +37,7 @@ use crate::caches::TableSnapshotStatisticCache; use crate::InMemoryLruCache; use crate::TableDataCache; use crate::TableDataCacheBuilder; +use crate::TempDir; static DEFAULT_FILE_META_DATA_CACHE_ITEMS: usize = 3000; @@ -54,6 +55,7 @@ pub struct CacheManager { table_data_cache: Option, in_memory_table_data_cache: Option, block_meta_cache: Option, + temp_dir_config: Option, } impl CacheManager { @@ -63,13 +65,15 @@ impl CacheManager { max_server_memory_usage: &u64, tenant_id: impl Into, ) -> Result<()> { + let tenant_id = tenant_id.into(); + // setup table data cache let table_data_cache = { match config.data_cache_storage { CacheStorageTypeInnerConfig::None => None, CacheStorageTypeInnerConfig::Disk => { let real_disk_cache_root = PathBuf::from(&config.disk_cache_config.path) - .join(tenant_id.into()) + .join(tenant_id.clone()) .join("v1"); let queue_size: u32 = if config.table_data_cache_population_queue_size > 0 { @@ -99,6 +103,22 @@ impl CacheManager { } }; + let temp_dir_config = match config.data_cache_storage { + CacheStorageTypeInnerConfig::None => None, + CacheStorageTypeInnerConfig::Disk => { + let path = PathBuf::from(&config.disk_cache_config.path) + .join("temp") + .join(tenant_id.clone()); + + let temp_dir = TempDir { + path, + bytes_limit: config.disk_cache_config.max_bytes as usize, + }; + temp_dir.init()?; + Some(temp_dir) + } + }; + // setup in-memory table column cache let memory_cache_capacity = if config.table_data_deserialized_data_bytes != 0 { config.table_data_deserialized_data_bytes as usize @@ -127,6 +147,7 @@ impl CacheManager { table_data_cache, in_memory_table_data_cache, block_meta_cache: None, + temp_dir_config, })); } else { let table_snapshot_cache = Self::new_named_items_cache( @@ -194,6 +215,7 @@ impl CacheManager { table_data_cache, in_memory_table_data_cache, block_meta_cache, + temp_dir_config, })); } @@ -252,6 +274,10 @@ impl CacheManager { self.in_memory_table_data_cache.clone() } + pub fn get_temp_dir_config(&self) -> Option { + self.temp_dir_config.clone() + } + pub fn new_named_items_cache>>( capacity: usize, name: impl Into, diff --git a/src/query/storages/common/cache/src/temp_file.rs b/src/query/storages/common/cache/src/temp_file.rs new file mode 100644 index 000000000000..dfb4996a541c --- /dev/null +++ b/src/query/storages/common/cache/src/temp_file.rs @@ -0,0 +1,30 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::path::PathBuf; + +use databend_common_exception::Result; + +#[derive(Clone)] +pub struct TempDir { + pub path: PathBuf, + pub bytes_limit: usize, +} + +impl TempDir { + pub fn init(&self) -> Result<()> { + let _ = std::fs::remove_dir_all(&self.path); + Ok(std::fs::create_dir_all(&self.path)?) + } +} From 3ac783e7a2a149e7f99eba4891d7a537e9ecdd14 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 12 Sep 2024 17:31:21 +0800 Subject: [PATCH 06/40] refactor spill Signed-off-by: coldWater --- .../src/pipelines/builders/builder_window.rs | 34 ++++---- ...transform_window_partition_spill_reader.rs | 15 +--- ...transform_window_partition_spill_writer.rs | 41 +++------ .../partition_by/window_partition_meta.rs | 2 +- src/query/service/src/spillers/mod.rs | 4 +- src/query/service/src/spillers/spiller.rs | 86 +++++++++++++------ 6 files changed, 91 insertions(+), 91 deletions(-) diff --git a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index d33f74aebbd1..47f12c491d9c 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -25,13 +25,13 @@ use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_core::query_spill_prefix; use databend_common_pipeline_core::Pipe; use databend_common_pipeline_core::PipeItem; +use databend_common_pipeline_core::Pipeline; use databend_common_sql::executor::physical_plans::Window; use databend_common_sql::executor::physical_plans::WindowPartition; use databend_common_storage::DataOperator; use databend_storages_common_cache::CacheManager; use tokio::sync::Semaphore; -use crate::pipelines::processors::transforms::DiskSpillConfig; use crate::pipelines::processors::transforms::FrameBound; use crate::pipelines::processors::transforms::TransformWindowPartitionBucket; use crate::pipelines::processors::transforms::TransformWindowPartitionScatter; @@ -41,6 +41,7 @@ use crate::pipelines::processors::transforms::TransformWindowPartitionSpillWrite use crate::pipelines::processors::transforms::WindowFunctionInfo; use crate::pipelines::processors::TransformWindow; use crate::pipelines::PipelineBuilder; +use crate::spillers::DiskSpillConfig; impl PipelineBuilder { pub(crate) fn build_window(&mut self, window: &Window) -> Result<()> { @@ -206,20 +207,7 @@ impl PipelineBuilder { )) })?; - let input_nums = self.main_pipeline.output_len(); - let transform = TransformWindowPartitionBucket::create(input_nums)?; - - let inputs = transform.get_inputs(); - let output = transform.get_output(); - - self.main_pipeline - .add_pipe(Pipe::create(inputs.len(), 1, vec![PipeItem::create( - ProcessorPtr::create(Box::new(transform)), - inputs, - vec![output], - )])); - - self.main_pipeline.try_resize(input_nums)?; + add_partition_bucket(&mut self.main_pipeline)?; let max_spill_io_requests = self.settings.get_max_spill_io_requests()? 
as usize; let semaphore = Arc::new(Semaphore::new(max_spill_io_requests)); @@ -255,3 +243,19 @@ impl PipelineBuilder { Ok(()) } } + +fn add_partition_bucket(pipeline: &mut Pipeline) -> Result<()> { + let input_nums = pipeline.output_len(); + let transform = TransformWindowPartitionBucket::create(input_nums)?; + + let inputs = transform.get_inputs(); + let output = transform.get_output(); + + pipeline.add_pipe(Pipe::create(inputs.len(), 1, vec![PipeItem::create( + ProcessorPtr::create(Box::new(transform)), + inputs, + vec![output], + )])); + + pipeline.try_resize(input_nums) +} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs index 97dbf742885b..6d36634b55fd 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs @@ -23,7 +23,6 @@ use databend_common_base::runtime::profile::ProfileStatisticsName; use databend_common_cache::dma_read_file_range; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::arrow::deserialize_column; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::BlockMetaInfoPtr; use databend_common_expression::DataBlock; @@ -41,6 +40,7 @@ use super::Location; use super::Partitioned; use super::WindowPartitionMeta; use super::WindowPayload; +use crate::spillers::deserialize_block; pub struct TransformWindowPartitionSpillReader { input: Arc, @@ -244,16 +244,3 @@ impl TransformWindowPartitionSpillReader { Ok(blocks) } } - -pub fn deserialize_block(columns_layout: &[u64], mut data: &[u8]) -> DataBlock { - let columns = columns_layout - .iter() - .map(|layout| { - let (cur, remain) = data.split_at(*layout as usize); - data = remain; - deserialize_column(cur).unwrap() - }) - .collect::>(); - - DataBlock::new_from_columns(columns) -} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs index 5bc03e647866..9e6b916a6d49 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs @@ -15,7 +15,6 @@ use std::any::Any; use std::io; use std::path::Path; -use std::path::PathBuf; use std::sync::Arc; use std::time::Instant; @@ -27,7 +26,6 @@ use databend_common_cache::dma_write_file_vectored; use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::arrow::serialize_column; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; use databend_common_pipeline_core::processors::Event; @@ -45,6 +43,8 @@ use super::WindowPartitionMeta; use crate::pipelines::processors::transforms::window::partition_by::SpillingWindowPayloads; use crate::pipelines::processors::transforms::window::partition_by::PARTITION_COUNT; use crate::sessions::QueryContext; +use crate::spillers::DiskSpillConfig; 
+use crate::spillers::EncodedBlock; pub struct TransformWindowPartitionSpillWriter { ctx: Arc, @@ -82,12 +82,6 @@ impl TransformWindowPartitionSpillWriter { } } -#[derive(Clone)] -pub struct DiskSpillConfig { - pub root: PathBuf, - pub bytes_limit: usize, -} - #[async_trait::async_trait] impl Processor for TransformWindowPartitionSpillWriter { fn name(&self) -> String { @@ -213,26 +207,13 @@ pub fn spilling_window_payload( } rows += block.num_rows(); - let columns_data = block - .columns() - .iter() - .map(|entry| { - let column = entry - .value - .convert_to_full_column(&entry.data_type, block.num_rows()); - serialize_column(&column) - }) - .collect::>(); - - let columns_layout = columns_data - .iter() - .map(|data| data.len() as u64) - .collect::>(); - - write_data.push(columns_data); + let encoded = EncodedBlock::from_block(&block); let begin = write_size; - write_size += columns_layout.iter().sum::(); + write_size += encoded.size() as u64; + let columns_layout = encoded.columns_layout(); + + write_data.push(encoded); Some((bucket, columns_layout, begin..write_size)) }) @@ -304,12 +285,12 @@ pub fn spilling_window_payload( async fn write_to_storage( operator: &Operator, path: &str, - write_data: Vec>>, + write_data: Vec, ) -> Result { let mut writer = operator.writer_with(path).chunk(8 * 1024 * 1024).await?; let mut written = 0; - for data in write_data.into_iter().flatten() { + for data in write_data.into_iter().flat_map(|x| x.0) { written += data.len(); writer.write(data).await?; } @@ -318,10 +299,10 @@ async fn write_to_storage( Ok(written) } -async fn write_to_disk(path: impl AsRef, write_data: Vec>>) -> io::Result { +async fn write_to_disk(path: impl AsRef, write_data: Vec) -> io::Result { let bufs = write_data .iter() - .flatten() + .flat_map(|x| &x.0) .map(|data| io::IoSlice::new(data)) .collect::>(); diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs index 2de287e17425..ee980a7e0002 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/window_partition_meta.rs @@ -36,7 +36,7 @@ pub struct BucketSpilledWindowPayload { pub bucket: isize, pub location: Location, pub data_range: Range, - pub columns_layout: Vec, + pub columns_layout: Vec, } #[derive(Debug, Clone)] diff --git a/src/query/service/src/spillers/mod.rs b/src/query/service/src/spillers/mod.rs index aa3b57e83ddd..8317cff20ed9 100644 --- a/src/query/service/src/spillers/mod.rs +++ b/src/query/service/src/spillers/mod.rs @@ -15,7 +15,5 @@ mod spiller; mod spiller_buffer; -pub use spiller::Spiller; -pub use spiller::SpillerConfig; -pub use spiller::SpillerType; +pub use spiller::*; pub use spiller_buffer::SpillBuffer; diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index 164e6a81820c..84c6c3a3d019 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -16,6 +16,7 @@ use std::collections::HashMap; use std::collections::HashSet; use std::fmt::Display; use std::fmt::Formatter; +use std::path::PathBuf; use std::sync::Arc; use std::time::Instant; @@ -114,18 +115,12 @@ impl Spiller { /// We should guarantee that the file is managed by this spiller. 
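A note on the encoding that read_spilled_file below decodes: the EncodedBlock introduced in this commit is a sidecar-length scheme. Each column is serialized independently, the buffers travel back to back, and only their byte lengths are kept as columns_layout, so decoding is a plain walk of split_at calls with the same shape as deserialize_block. A self-contained round-trip sketch, with plain byte vectors standing in for serialized columns (layout, concat, and split are illustrative helpers):

fn layout(columns: &[Vec<u8>]) -> Vec<usize> {
    columns.iter().map(|c| c.len()).collect()
}

fn concat(columns: &[Vec<u8>]) -> Vec<u8> {
    columns.iter().flat_map(|c| c.iter().copied()).collect()
}

// Mirrors deserialize_block: consume layout[i] bytes for column i, in order.
fn split(layout: &[usize], mut data: &[u8]) -> Vec<Vec<u8>> {
    layout
        .iter()
        .map(|&len| {
            let (cur, rest) = data.split_at(len);
            data = rest;
            cur.to_vec()
        })
        .collect()
}

fn main() {
    let columns = vec![b"abc".to_vec(), b"".to_vec(), b"wxyz".to_vec()];
    let (l, buf) = (layout(&columns), concat(&columns));
    assert_eq!(split(&l, &buf), columns); // lossless, and no delimiters in the data
}

Keeping the lengths out of band is what lets the generic Spiller and the window spill reader share one decoder.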
pub async fn read_spilled_file(&self, file: &str) -> Result { debug_assert!(self.columns_layout.contains_key(file)); + let instant = Instant::now(); + let data = self.operator.read(file).await?.to_bytes(); let bytes = data.len(); - - let mut begin = 0; - let instant = Instant::now(); - let mut columns = Vec::with_capacity(self.columns_layout.len()); let columns_layout = self.columns_layout.get(file).unwrap(); - for column_layout in columns_layout.iter() { - columns.push(deserialize_column(&data[begin..begin + column_layout]).unwrap()); - begin += column_layout; - } - let block = DataBlock::new_from_columns(columns); + let block = deserialize_block(columns_layout, &data); Profile::record_usize_profile(ProfileStatisticsName::SpillReadCount, 1); Profile::record_usize_profile(ProfileStatisticsName::SpillReadBytes, bytes); @@ -138,35 +133,25 @@ impl Spiller { } /// Write a [`DataBlock`] to storage. - pub async fn spill_block(&mut self, data: DataBlock) -> Result { + pub async fn spill_block(&mut self, block: DataBlock) -> Result { let instant = Instant::now(); let unique_name = GlobalUniqName::unique(); let location = format!("{}/{}", self.config.location_prefix, unique_name); - let mut write_bytes = 0; + + let encoded = EncodedBlock::from_block(&block); + let columns_layout = encoded.columns_layout(); + let write_bytes = encoded.size(); + + self.columns_layout + .insert(location.to_string(), columns_layout); let mut writer = self .operator .writer_with(&location) .chunk(8 * 1024 * 1024) .await?; - let columns = data.columns().to_vec(); - let mut columns_data = Vec::with_capacity(columns.len()); - for column in columns.into_iter() { - let column = column - .value - .convert_to_full_column(&column.data_type, data.num_rows()); - let column_data = serialize_column(&column); - self.columns_layout - .entry(location.to_string()) - .and_modify(|layouts| { - layouts.push(column_data.len()); - }) - .or_insert(vec![column_data.len()]); - write_bytes += column_data.len(); - columns_data.push(column_data); - } - for data in columns_data.into_iter() { + for data in encoded.0.into_iter() { writer.write(data).await?; } writer.close().await?; @@ -229,3 +214,48 @@ impl Spiller { self.columns_layout.keys().cloned().collect() } } + +pub struct EncodedBlock(pub Vec>); + +impl EncodedBlock { + pub fn from_block(block: &DataBlock) -> Self { + let data = block + .columns() + .iter() + .map(|entry| { + let column = entry + .value + .convert_to_full_column(&entry.data_type, block.num_rows()); + serialize_column(&column) + }) + .collect(); + EncodedBlock(data) + } + + pub fn columns_layout(&self) -> Vec { + self.0.iter().map(|data| data.len()).collect() + } + + pub fn size(&self) -> usize { + self.0.iter().map(|data| data.len()).sum() + } +} + +pub fn deserialize_block(columns_layout: &[usize], mut data: &[u8]) -> DataBlock { + let columns = columns_layout + .iter() + .map(|layout| { + let (cur, remain) = data.split_at(*layout); + data = remain; + deserialize_column(cur).unwrap() + }) + .collect::>(); + + DataBlock::new_from_columns(columns) +} + +#[derive(Clone)] +pub struct DiskSpillConfig { + pub root: PathBuf, + pub bytes_limit: usize, +} From f41a03873e56594a2b232ab0159938b1ce2e4564 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 13 Sep 2024 10:12:44 +0800 Subject: [PATCH 07/40] clear temp dir Signed-off-by: coldWater --- .../service/src/interpreters/hook/vacuum_hook.rs | 11 +++++++++++ src/query/service/src/interpreters/interpreter.rs | 11 +++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff 
--git a/src/query/service/src/interpreters/hook/vacuum_hook.rs b/src/query/service/src/interpreters/hook/vacuum_hook.rs index 4357027826fc..53be979a242a 100644 --- a/src/query/service/src/interpreters/hook/vacuum_hook.rs +++ b/src/query/service/src/interpreters/hook/vacuum_hook.rs @@ -23,6 +23,7 @@ use databend_common_license::license_manager::get_license_manager; use databend_common_pipeline_core::query_spill_prefix; use databend_common_storage::DataOperator; use databend_enterprise_vacuum_handler::get_vacuum_handler; +use databend_storages_common_cache::CacheManager; use opendal::Buffer; use crate::sessions::QueryContext; @@ -67,3 +68,13 @@ pub fn hook_vacuum_temp_files(query_ctx: &Arc) -> Result<()> { Ok(()) } + +pub fn hook_disk_temp_dir(query_ctx: &Arc) -> Result<()> { + Ok(CacheManager::instance() + .get_temp_dir_config() + .map(|cfg| { + let root = cfg.path.join(query_ctx.get_id()); + std::fs::remove_dir_all(root) + }) + .unwrap_or(Ok(()))?) +} diff --git a/src/query/service/src/interpreters/interpreter.rs b/src/query/service/src/interpreters/interpreter.rs index 1792b7fbe0ea..699098c30119 100644 --- a/src/query/service/src/interpreters/interpreter.rs +++ b/src/query/service/src/interpreters/interpreter.rs @@ -44,10 +44,11 @@ use log::info; use md5::Digest; use md5::Md5; -use crate::interpreters::hook::vacuum_hook::hook_vacuum_temp_files; -use crate::interpreters::interpreter_txn_commit::CommitInterpreter; -use crate::interpreters::InterpreterMetrics; -use crate::interpreters::InterpreterQueryLog; +use super::hook::vacuum_hook::hook_disk_temp_dir; +use super::hook::vacuum_hook::hook_vacuum_temp_files; +use super::interpreter_txn_commit::CommitInterpreter; +use super::InterpreterMetrics; +use super::InterpreterQueryLog; use crate::pipelines::executor::ExecutorSettings; use crate::pipelines::executor::PipelineCompleteExecutor; use crate::pipelines::executor::PipelinePullingExecutor; @@ -289,6 +290,8 @@ pub fn on_execution_finished(info: &ExecutionInfo, query_ctx: Arc) hook_vacuum_temp_files(&query_ctx)?; + hook_disk_temp_dir(&query_ctx)?; + let err_opt = match &info.res { Ok(_) => None, Err(e) => Some(e.clone()), From 11f3f70246c2f6e0840067d08cb9038823422e50 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 13 Sep 2024 11:34:34 +0800 Subject: [PATCH 08/40] move dma Signed-off-by: coldWater --- Cargo.lock | 3 --- .../{cache/src/disk => base/src/base}/dma.rs | 3 ++- src/common/base/src/base/mod.rs | 4 ++++ src/common/cache/Cargo.toml | 3 --- src/common/cache/src/disk.rs | 15 --------------- src/common/cache/src/lib.rs | 4 ---- .../transform_window_partition_spill_reader.rs | 2 +- .../transform_window_partition_spill_writer.rs | 2 +- 8 files changed, 8 insertions(+), 28 deletions(-) rename src/common/{cache/src/disk => base/src/base}/dma.rs (99%) delete mode 100644 src/common/cache/src/disk.rs diff --git a/Cargo.lock b/Cargo.lock index 5f34ab7c644b..12ae34b87c44 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3237,11 +3237,8 @@ dependencies = [ name = "databend-common-cache" version = "0.1.0" dependencies = [ - "databend-common-base", "hashbrown 0.14.5", "hashlink 0.8.4", - "libc", - "tokio", ] [[package]] diff --git a/src/common/cache/src/disk/dma.rs b/src/common/base/src/base/dma.rs similarity index 99% rename from src/common/cache/src/disk/dma.rs rename to src/common/base/src/base/dma.rs index 9ce0744cd829..6e0d3284c7df 100644 --- a/src/common/cache/src/disk/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -24,10 +24,11 @@ use std::ops::Range; use std::os::unix::io::AsRawFd; use 
std::path::Path; -use databend_common_base::runtime::spawn_blocking; use tokio::fs::File; use tokio::io::AsyncSeekExt; +use crate::runtime::spawn_blocking; + /// An aligned buffer used to perform io on a `DmaFile`. #[derive(Debug)] pub struct DmaBuffer { diff --git a/src/common/base/src/base/mod.rs b/src/common/base/src/base/mod.rs index 175730845145..4ba645c70c31 100644 --- a/src/common/base/src/base/mod.rs +++ b/src/common/base/src/base/mod.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod dma; mod net; mod ordered_float; mod profiling; @@ -27,6 +28,9 @@ mod take_mut; mod uniq_id; mod watch_notify; +pub use dma::dma_read_file; +pub use dma::dma_read_file_range; +pub use dma::dma_write_file_vectored; pub use net::get_free_tcp_port; pub use net::get_free_udp_port; pub use ordered_float::OrderedFloat; diff --git a/src/common/cache/Cargo.toml b/src/common/cache/Cargo.toml index 793982f45d8c..cfd7bccd6229 100644 --- a/src/common/cache/Cargo.toml +++ b/src/common/cache/Cargo.toml @@ -11,11 +11,8 @@ doctest = false test = true [dependencies] -databend-common-base = { workspace = true } hashbrown = { workspace = true } hashlink = "0.8" -libc = "0.2.158" -tokio = { workspace = true } [dev-dependencies] diff --git a/src/common/cache/src/disk.rs b/src/common/cache/src/disk.rs deleted file mode 100644 index 740c34018d60..000000000000 --- a/src/common/cache/src/disk.rs +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
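Worth noting about the relocated API: dma_read_file_range hands back a (buffer, range) pair and callers index &buf[range], because O_DIRECT transfers must start and end on alignment boundaries, so the buffer read is generally wider than the bytes requested. A sketch of that window arithmetic under an assumed fixed alignment (aligned_window is an illustrative name; the real code derives the alignment from the filesystem):

use std::ops::Range;

// Widen `want` to alignment boundaries, then re-express it inside the wide buffer.
fn aligned_window(align: usize, want: Range<usize>) -> (Range<usize>, Range<usize>) {
    let start = want.start / align * align; // align down
    let end = (want.end + align - 1) / align * align; // align up
    let outer = start..end; // what the O_DIRECT read actually covers
    let inner = (want.start - start)..(want.end - start); // caller's bytes within it
    (outer, inner)
}

fn main() {
    let (outer, inner) = aligned_window(512, 700..1000);
    assert_eq!(outer, 512..1024); // two full sectors read from disk
    assert_eq!(inner, 188..488); // the 300 requested bytes inside them
}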
- -pub mod dma; diff --git a/src/common/cache/src/lib.rs b/src/common/cache/src/lib.rs index 6f2de81950f0..78b060e9963d 100644 --- a/src/common/cache/src/lib.rs +++ b/src/common/cache/src/lib.rs @@ -16,13 +16,9 @@ #![allow(clippy::uninlined_format_args)] mod cache; -mod disk; mod mem_sized; pub use cache::lru::LruCache; pub use cache::Cache; -pub use disk::dma::dma_read_file; -pub use disk::dma::dma_read_file_range; -pub use disk::dma::dma_write_file_vectored; pub use hashbrown::hash_map::DefaultHashBuilder; pub use mem_sized::MemSized; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs index 6d36634b55fd..39427d3a22a9 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_reader.rs @@ -18,9 +18,9 @@ use std::sync::Arc; use std::time::Duration; use std::time::Instant; +use databend_common_base::base::dma_read_file_range; use databend_common_base::runtime::profile::Profile; use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_cache::dma_read_file_range; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::BlockMetaInfoDowncast; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs index 9e6b916a6d49..adb51e3324b2 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs @@ -18,11 +18,11 @@ use std::path::Path; use std::sync::Arc; use std::time::Instant; +use databend_common_base::base::dma_write_file_vectored; use databend_common_base::base::GlobalUniqName; use databend_common_base::base::ProgressValues; use databend_common_base::runtime::profile::Profile; use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_cache::dma_write_file_vectored; use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; From c77dbcd616a2f792fe656919e3471bd19ea48166 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 13 Sep 2024 15:33:20 +0800 Subject: [PATCH 09/40] config Signed-off-by: coldWater --- src/query/config/src/config.rs | 36 +++++++++++ src/query/config/src/inner.rs | 17 ++++++ src/query/config/src/lib.rs | 1 + src/query/config/src/mask.rs | 1 + src/query/service/src/global_services.rs | 5 ++ .../src/interpreters/hook/vacuum_hook.rs | 6 +- .../src/pipelines/builders/builder_window.rs | 19 +++--- ...transform_window_partition_spill_writer.rs | 19 +++--- src/query/service/src/spillers/spiller.rs | 24 +++++++- src/query/storages/common/cache/src/lib.rs | 5 +- .../storages/common/cache/src/manager.rs | 28 +-------- .../storages/common/cache/src/temp_dir.rs | 61 +++++++++++++++++++ .../storages/common/cache/src/temp_file.rs | 30 --------- 13 files changed, 171 insertions(+), 81 deletions(-) create mode 100644 
src/query/storages/common/cache/src/temp_dir.rs delete mode 100644 src/query/storages/common/cache/src/temp_file.rs diff --git a/src/query/config/src/config.rs b/src/query/config/src/config.rs index 767bdb255b22..fdaac90bfbc0 100644 --- a/src/query/config/src/config.rs +++ b/src/query/config/src/config.rs @@ -132,6 +132,10 @@ pub struct Config { #[clap(flatten)] pub cache: CacheConfig, + // spill config + #[clap(flatten)] + pub spill: SpillConfig, + // background configs #[clap(flatten)] pub background: BackgroundConfig, @@ -2930,6 +2934,18 @@ pub struct DiskCacheConfig { pub sync_data: bool, } +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Args, Default)] +#[serde(default, deny_unknown_fields)] +pub struct SpillConfig { + /// Path of spill to local disk. Disabled if it's empty. + #[clap( + long, + value_name = "VALUE", + default_value = "./.databend/temp/_query_spill" + )] + pub spill_local_disk_path: String, +} + mod cache_config_converters { use log::warn; @@ -2953,6 +2969,7 @@ mod cache_config_converters { .map(|(k, v)| (k, v.into())) .collect(), cache: inner.cache.into(), + spill: inner.spill.into(), background: inner.background.into(), } } @@ -2985,6 +3002,7 @@ mod cache_config_converters { storage: self.storage.try_into()?, catalogs, cache: self.cache.try_into()?, + spill: self.spill.try_into()?, background: self.background.try_into()?, }) } @@ -3047,6 +3065,24 @@ mod cache_config_converters { } } + impl TryFrom<SpillConfig> for inner::SpillConfig { + type Error = ErrorCode; + + fn try_from(value: SpillConfig) -> std::result::Result<Self, Self::Error> { + Ok(Self { + path: value.spill_local_disk_path, + }) + } + } + + impl From<inner::SpillConfig> for SpillConfig { + fn from(value: inner::SpillConfig) -> Self { + Self { + spill_local_disk_path: value.path, + } + } + } + impl TryFrom<DiskCacheConfig> for inner::DiskCacheConfig { type Error = ErrorCode; fn try_from(value: DiskCacheConfig) -> std::result::Result<Self, Self::Error> { diff --git a/src/query/config/src/inner.rs b/src/query/config/src/inner.rs index cf6dc8fb847f..050347314884 100644 --- a/src/query/config/src/inner.rs +++ b/src/query/config/src/inner.rs @@ -64,6 +64,9 @@ pub struct InnerConfig { // Cache Config pub cache: CacheConfig, + // Spill Config + pub spill: SpillConfig, + // Background Config pub background: InnerBackgroundConfig, } @@ -701,3 +704,17 @@ impl Default for CacheConfig { } } } + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct SpillConfig { + /// Path of spill to local disk. Disabled if it's empty.
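Elsewhere in the series this single configured path fans out per tenant and per query: TempDirManager joins the tenant id at startup and the pipeline builder joins the query id underneath that. A sketch of the resulting layout together with the empty-path-disables convention (spill_dir is an illustrative helper, not part of the patch):

use std::path::PathBuf;

// An empty configured path disables disk spill entirely, per SpillConfig's contract.
fn spill_dir(configured: &str, tenant: &str, query_id: &str) -> Option<PathBuf> {
    if configured.is_empty() {
        return None;
    }
    Some(PathBuf::from(configured).join(tenant).join(query_id))
}

fn main() {
    assert_eq!(spill_dir("", "t1", "q1"), None);
    assert_eq!(
        spill_dir("./.databend/temp/_query_spill", "t1", "q1"),
        Some(PathBuf::from("./.databend/temp/_query_spill/t1/q1")),
    );
}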
+ pub path: String, +} + +impl Default for SpillConfig { + fn default() -> Self { + Self { + path: "./.databend/temp/_query_spill".to_string(), + } + } +} diff --git a/src/query/config/src/lib.rs b/src/query/config/src/lib.rs index 269241a72bed..c6a9313447c9 100644 --- a/src/query/config/src/lib.rs +++ b/src/query/config/src/lib.rs @@ -49,6 +49,7 @@ pub use inner::CatalogConfig; pub use inner::CatalogHiveConfig; pub use inner::DiskCacheKeyReloadPolicy; pub use inner::InnerConfig; +pub use inner::SpillConfig; pub use inner::ThriftProtocol; pub use version::DATABEND_COMMIT_VERSION; pub use version::QUERY_GIT_SEMVER; diff --git a/src/query/config/src/mask.rs b/src/query/config/src/mask.rs index 37fae7279aac..62a5086b2c52 100644 --- a/src/query/config/src/mask.rs +++ b/src/query/config/src/mask.rs @@ -51,6 +51,7 @@ impl Config { storage: self.storage.mask_display(), catalog: self.catalog, cache: self.cache, + spill: self.spill, background: self.background, catalogs: self.catalogs, } diff --git a/src/query/service/src/global_services.rs b/src/query/service/src/global_services.rs index c9f91b8818ef..a6194387c3b3 100644 --- a/src/query/service/src/global_services.rs +++ b/src/query/service/src/global_services.rs @@ -35,6 +35,7 @@ use databend_common_users::builtin::BuiltIn; use databend_common_users::RoleCacheManager; use databend_common_users::UserApiProvider; use databend_storages_common_cache::CacheManager; +use databend_storages_common_cache::TempDirManager; use crate::auth::AuthMgr; use crate::builtin::BuiltinUDFs; @@ -146,6 +147,10 @@ impl GlobalServices { &config.query.max_server_memory_usage, config.query.tenant_id.tenant_name().to_string(), )?; + TempDirManager::init( + &config.spill, + config.query.tenant_id.tenant_name().to_string(), + )?; if let Some(addr) = config.query.cloud_control_grpc_server_address.clone() { CloudControlApiProvider::init(addr, config.query.cloud_control_grpc_timeout).await?; diff --git a/src/query/service/src/interpreters/hook/vacuum_hook.rs b/src/query/service/src/interpreters/hook/vacuum_hook.rs index 53be979a242a..7e4f72c6b51a 100644 --- a/src/query/service/src/interpreters/hook/vacuum_hook.rs +++ b/src/query/service/src/interpreters/hook/vacuum_hook.rs @@ -23,7 +23,7 @@ use databend_common_license::license_manager::get_license_manager; use databend_common_pipeline_core::query_spill_prefix; use databend_common_storage::DataOperator; use databend_enterprise_vacuum_handler::get_vacuum_handler; -use databend_storages_common_cache::CacheManager; +use databend_storages_common_cache::TempDirManager; use opendal::Buffer; use crate::sessions::QueryContext; @@ -70,8 +70,8 @@ pub fn hook_vacuum_temp_files(query_ctx: &Arc) -> Result<()> { } pub fn hook_disk_temp_dir(query_ctx: &Arc) -> Result<()> { - Ok(CacheManager::instance() - .get_temp_dir_config() + Ok(TempDirManager::instance() + .get_disk_spill_config() .map(|cfg| { let root = cfg.path.join(query_ctx.get_id()); std::fs::remove_dir_all(root) diff --git a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index 47f12c491d9c..437eac113689 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -29,7 +29,7 @@ use databend_common_pipeline_core::Pipeline; use databend_common_sql::executor::physical_plans::Window; use databend_common_sql::executor::physical_plans::WindowPartition; use databend_common_storage::DataOperator; -use 
databend_storages_common_cache::CacheManager; +use databend_storages_common_cache::TempDirManager; use tokio::sync::Semaphore; use crate::pipelines::processors::transforms::FrameBound; @@ -41,7 +41,7 @@ use crate::pipelines::processors::transforms::TransformWindowPartitionSpillWrite use crate::pipelines::processors::transforms::WindowFunctionInfo; use crate::pipelines::processors::TransformWindow; use crate::pipelines::PipelineBuilder; -use crate::spillers::DiskSpillConfig; +use crate::spillers::DiskSpill; impl PipelineBuilder { pub(crate) fn build_window(&mut self, window: &Window) -> Result<()> { @@ -186,13 +186,14 @@ impl PipelineBuilder { let location_prefix = query_spill_prefix(self.ctx.get_tenant().tenant_name(), &self.ctx.get_id()); - let disk_spill = - CacheManager::instance() - .get_temp_dir_config() - .map(|cfg| DiskSpillConfig { - root: cfg.path.join(self.ctx.get_id()), - bytes_limit: 1 << 20, // todo - }); + let disk_spill = match TempDirManager::instance().get_disk_spill_config() { + None => None, + Some(cfg) => { + let root = cfg.path.join(self.ctx.get_id()); + std::fs::create_dir(&root)?; + Some(DiskSpill::new(root, 5 << 20)) // todo + } + }; self.main_pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create( diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs index adb51e3324b2..6522f9e6e9d2 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs @@ -43,7 +43,7 @@ use super::WindowPartitionMeta; use crate::pipelines::processors::transforms::window::partition_by::SpillingWindowPayloads; use crate::pipelines::processors::transforms::window::partition_by::PARTITION_COUNT; use crate::sessions::QueryContext; -use crate::spillers::DiskSpillConfig; +use crate::spillers::DiskSpill; use crate::spillers::EncodedBlock; pub struct TransformWindowPartitionSpillWriter { @@ -53,7 +53,7 @@ pub struct TransformWindowPartitionSpillWriter { operator: Operator, location_prefix: String, - disk_spill: Option, + disk_spill: Option, spilled_block: Option, spilling_meta: Option, spilling_future: Option>>, @@ -65,7 +65,7 @@ impl TransformWindowPartitionSpillWriter { input: Arc, output: Arc, operator: Operator, - disk_spill: Option, + disk_spill: Option, location_prefix: String, ) -> Box { Box::new(TransformWindowPartitionSpillWriter { @@ -189,7 +189,7 @@ pub fn spilling_window_payload( ctx: Arc, operator: Operator, location_prefix: &str, - disk_spill: Option<&mut DiskSpillConfig>, + disk_spill: Option<&mut DiskSpill>, unique_name: String, payload: SpillingWindowPayloads, ) -> Result>> { @@ -219,11 +219,14 @@ pub fn spilling_window_payload( }) .collect::>(); - let location = match disk_spill { - Some(disk) if disk.bytes_limit as u64 >= write_size => { - disk.bytes_limit -= write_size as usize; - Location::Disk(disk.root.join(unique_name)) + let location = match disk_spill.map(|disk| { + if disk.try_write(write_size as isize) { + Some(Location::Disk(disk.root.join(unique_name.clone()))) + } else { + None } + }) { + Some(Some(x)) => x, _ => Location::Storage(format!("{location_prefix}/{unique_name}")), }; diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index 
84c6c3a3d019..e6f60f99752a 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -18,6 +18,7 @@ use std::fmt::Display; use std::fmt::Formatter; use std::path::PathBuf; use std::sync::Arc; +use std::sync::Mutex; use std::time::Instant; use databend_common_base::base::GlobalUniqName; @@ -255,7 +256,26 @@ pub fn deserialize_block(columns_layout: &[usize], mut data: &[u8]) -> DataBlock } #[derive(Clone)] -pub struct DiskSpillConfig { +pub struct DiskSpill { pub root: PathBuf, - pub bytes_limit: usize, + pub bytes_limit: Arc>, +} + +impl DiskSpill { + pub fn new(root: PathBuf, limit: isize) -> DiskSpill { + DiskSpill { + root, + bytes_limit: Arc::new(Mutex::new(limit)), + } + } + + pub fn try_write(&mut self, size: isize) -> bool { + let mut guard = self.bytes_limit.lock().unwrap(); + if *guard > size { + *guard -= size; + true + } else { + false + } + } } diff --git a/src/query/storages/common/cache/src/lib.rs b/src/query/storages/common/cache/src/lib.rs index 84c1ba172f0f..b9be0ee5e59c 100644 --- a/src/query/storages/common/cache/src/lib.rs +++ b/src/query/storages/common/cache/src/lib.rs @@ -20,7 +20,7 @@ mod caches; mod manager; mod providers; mod read; -mod temp_file; +mod temp_dir; pub use cache::CacheAccessor; pub use cache::Unit; @@ -46,4 +46,5 @@ pub use read::InMemoryCacheReader; pub use read::InMemoryItemCacheReader; pub use read::LoadParams; pub use read::Loader; -pub use temp_file::TempDir; +pub use temp_dir::TempDir; +pub use temp_dir::TempDirManager; diff --git a/src/query/storages/common/cache/src/manager.rs b/src/query/storages/common/cache/src/manager.rs index d56345bca1c8..6553f64f30fd 100644 --- a/src/query/storages/common/cache/src/manager.rs +++ b/src/query/storages/common/cache/src/manager.rs @@ -37,7 +37,6 @@ use crate::caches::TableSnapshotStatisticCache; use crate::InMemoryLruCache; use crate::TableDataCache; use crate::TableDataCacheBuilder; -use crate::TempDir; static DEFAULT_FILE_META_DATA_CACHE_ITEMS: usize = 3000; @@ -55,7 +54,6 @@ pub struct CacheManager { table_data_cache: Option, in_memory_table_data_cache: Option, block_meta_cache: Option, - temp_dir_config: Option, } impl CacheManager { @@ -65,15 +63,13 @@ impl CacheManager { max_server_memory_usage: &u64, tenant_id: impl Into, ) -> Result<()> { - let tenant_id = tenant_id.into(); - // setup table data cache let table_data_cache = { match config.data_cache_storage { CacheStorageTypeInnerConfig::None => None, CacheStorageTypeInnerConfig::Disk => { let real_disk_cache_root = PathBuf::from(&config.disk_cache_config.path) - .join(tenant_id.clone()) + .join(tenant_id.into()) .join("v1"); let queue_size: u32 = if config.table_data_cache_population_queue_size > 0 { @@ -103,22 +99,6 @@ impl CacheManager { } }; - let temp_dir_config = match config.data_cache_storage { - CacheStorageTypeInnerConfig::None => None, - CacheStorageTypeInnerConfig::Disk => { - let path = PathBuf::from(&config.disk_cache_config.path) - .join("temp") - .join(tenant_id.clone()); - - let temp_dir = TempDir { - path, - bytes_limit: config.disk_cache_config.max_bytes as usize, - }; - temp_dir.init()?; - Some(temp_dir) - } - }; - // setup in-memory table column cache let memory_cache_capacity = if config.table_data_deserialized_data_bytes != 0 { config.table_data_deserialized_data_bytes as usize @@ -147,7 +127,6 @@ impl CacheManager { table_data_cache, in_memory_table_data_cache, block_meta_cache: None, - temp_dir_config, })); } else { let table_snapshot_cache = Self::new_named_items_cache( @@ 
-215,7 +194,6 @@ impl CacheManager { table_data_cache, in_memory_table_data_cache, block_meta_cache, - temp_dir_config, })); } @@ -274,10 +252,6 @@ impl CacheManager { self.in_memory_table_data_cache.clone() } - pub fn get_temp_dir_config(&self) -> Option { - self.temp_dir_config.clone() - } - pub fn new_named_items_cache>>( capacity: usize, name: impl Into, diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs new file mode 100644 index 000000000000..fddadaf1fef9 --- /dev/null +++ b/src/query/storages/common/cache/src/temp_dir.rs @@ -0,0 +1,61 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::path::PathBuf; +use std::sync::Arc; + +use databend_common_base::base::GlobalInstance; +use databend_common_config::SpillConfig; +use databend_common_exception::Result; + +pub struct TempDirManager { + disk_spill_config: Option, +} + +impl TempDirManager { + pub fn init(config: &SpillConfig, tenant_id: String) -> Result<()> { + let disk_spill_config = if config.path.is_empty() { + None + } else { + let path = PathBuf::from(&config.path).join(tenant_id.clone()); + + let temp_dir = TempDir { path }; + temp_dir.init()?; + Some(temp_dir) + }; + + GlobalInstance::set(Arc::new(Self { disk_spill_config })); + Ok(()) + } + + pub fn instance() -> Arc { + GlobalInstance::get() + } + + pub fn get_disk_spill_config(&self) -> Option { + self.disk_spill_config.clone() + } +} + +#[derive(Clone)] +pub struct TempDir { + pub path: PathBuf, +} + +impl TempDir { + fn init(&self) -> Result<()> { + let _ = std::fs::remove_dir_all(&self.path); + Ok(std::fs::create_dir_all(&self.path)?) + } +} diff --git a/src/query/storages/common/cache/src/temp_file.rs b/src/query/storages/common/cache/src/temp_file.rs deleted file mode 100644 index dfb4996a541c..000000000000 --- a/src/query/storages/common/cache/src/temp_file.rs +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::path::PathBuf; - -use databend_common_exception::Result; - -#[derive(Clone)] -pub struct TempDir { - pub path: PathBuf, - pub bytes_limit: usize, -} - -impl TempDir { - pub fn init(&self) -> Result<()> { - let _ = std::fs::remove_dir_all(&self.path); - Ok(std::fs::create_dir_all(&self.path)?) 
- } -} From 205a1de001fb333a7cf7404beca91891cf870df4 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 13 Sep 2024 16:36:53 +0800 Subject: [PATCH 10/40] fix Signed-off-by: coldWater --- Cargo.toml | 1 + src/common/base/Cargo.toml | 2 +- src/common/tracing/Cargo.toml | 2 +- .../src/interpreters/hook/vacuum_hook.rs | 18 ++++++----- .../src/pipelines/builders/builder_window.rs | 12 ++++---- ...transform_window_partition_spill_writer.rs | 30 +++++++++++-------- src/query/service/src/spillers/spiller.rs | 30 ++++++++++++++----- 7 files changed, 59 insertions(+), 36 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cf0986912747..e5da097a613f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -247,6 +247,7 @@ http = "1" itertools = "0.10.5" jsonb = "0.4.1" jwt-simple = "0.11.0" +libc = { version = "0.2.158" } match-template = "0.0.1" mysql_async = { version = "0.34", default-features = false, features = ["native-tls-tls"] } object_store_opendal = "0.46" diff --git a/src/common/base/Cargo.toml b/src/common/base/Cargo.toml index 62a88b3efce1..b8934c96d1bf 100644 --- a/src/common/base/Cargo.toml +++ b/src/common/base/Cargo.toml @@ -31,7 +31,7 @@ databend-common-exception = { workspace = true } enquote = "1.1.0" fastrace = { workspace = true } futures = { workspace = true } -libc = "0.2.153" +libc = { workspace = true } log = { workspace = true } logcall = { workspace = true } micromarshal = "0.5.0" diff --git a/src/common/tracing/Cargo.toml b/src/common/tracing/Cargo.toml index 67f7a816cb21..95337f30f4ad 100644 --- a/src/common/tracing/Cargo.toml +++ b/src/common/tracing/Cargo.toml @@ -21,7 +21,7 @@ fastrace = { workspace = true } fastrace-opentelemetry = { workspace = true } fern = "0.6.2" itertools = { workspace = true } -libc = "0.2.153" +libc = { workspace = true } log = { workspace = true } opentelemetry = { workspace = true } opentelemetry-otlp = { workspace = true } diff --git a/src/query/service/src/interpreters/hook/vacuum_hook.rs b/src/query/service/src/interpreters/hook/vacuum_hook.rs index 7e4f72c6b51a..654b85bec3bd 100644 --- a/src/query/service/src/interpreters/hook/vacuum_hook.rs +++ b/src/query/service/src/interpreters/hook/vacuum_hook.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fs::remove_dir_all; +use std::io::ErrorKind; use std::sync::Arc; use std::time::Duration; @@ -70,11 +72,13 @@ pub fn hook_vacuum_temp_files(query_ctx: &Arc) -> Result<()> { } pub fn hook_disk_temp_dir(query_ctx: &Arc) -> Result<()> { - Ok(TempDirManager::instance() - .get_disk_spill_config() - .map(|cfg| { - let root = cfg.path.join(query_ctx.get_id()); - std::fs::remove_dir_all(root) - }) - .unwrap_or(Ok(()))?) 
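The replacement below also tightens the error handling: a missing directory is the expected case for a query that never spilled to disk, so only real I/O failures propagate. The same pattern in isolation, runnable against a scratch path:

use std::fs;
use std::io::{self, ErrorKind};
use std::path::Path;

// Remove a per-query spill dir; NotFound just means nothing was ever spilled.
fn cleanup(root: &Path) -> io::Result<()> {
    match fs::remove_dir_all(root) {
        Err(e) if e.kind() != ErrorKind::NotFound => Err(e),
        _ => Ok(()),
    }
}

fn main() -> io::Result<()> {
    let root = std::env::temp_dir().join("spill_cleanup_demo");
    cleanup(&root)?; // fine even though the directory does not exist yet
    fs::create_dir_all(&root)?;
    cleanup(&root)?; // removes it for real this time
    Ok(())
}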
+ if let Some(cfg) = TempDirManager::instance().get_disk_spill_config() { + let root = cfg.path.join(query_ctx.get_id()); + if let Err(e) = remove_dir_all(root) { + if !matches!(e.kind(), ErrorKind::NotFound) { + return Err(e.into()); + } + } + } + Ok(()) } diff --git a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index 437eac113689..2d53ecc19c25 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -186,14 +186,12 @@ impl PipelineBuilder { let location_prefix = query_spill_prefix(self.ctx.get_tenant().tenant_name(), &self.ctx.get_id()); - let disk_spill = match TempDirManager::instance().get_disk_spill_config() { - None => None, - Some(cfg) => { + let disk_spill = TempDirManager::instance() + .get_disk_spill_config() + .map(|cfg| { let root = cfg.path.join(self.ctx.get_id()); - std::fs::create_dir(&root)?; - Some(DiskSpill::new(root, 5 << 20)) // todo - } - }; + DiskSpill::new(root, 5 << 20) // todo + }); self.main_pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create( diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs index 6522f9e6e9d2..6e36d004665e 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs @@ -53,7 +53,7 @@ pub struct TransformWindowPartitionSpillWriter { operator: Operator, location_prefix: String, - disk_spill: Option, + disk_spill: Option>, spilled_block: Option, spilling_meta: Option, spilling_future: Option>>, @@ -65,7 +65,7 @@ impl TransformWindowPartitionSpillWriter { input: Arc, output: Arc, operator: Operator, - disk_spill: Option, + disk_spill: Option>, location_prefix: String, ) -> Box { Box::new(TransformWindowPartitionSpillWriter { @@ -156,7 +156,7 @@ impl Processor for TransformWindowPartitionSpillWriter { self.ctx.clone(), self.operator.clone(), &self.location_prefix, - self.disk_spill.as_mut(), + self.disk_spill.clone(), GlobalUniqName::unique(), payload, )?); @@ -189,7 +189,7 @@ pub fn spilling_window_payload( ctx: Arc, operator: Operator, location_prefix: &str, - disk_spill: Option<&mut DiskSpill>, + disk_spill: Option>, unique_name: String, payload: SpillingWindowPayloads, ) -> Result>> { @@ -219,16 +219,20 @@ pub fn spilling_window_payload( }) .collect::>(); - let location = match disk_spill.map(|disk| { - if disk.try_write(write_size as isize) { - Some(Location::Disk(disk.root.join(unique_name.clone()))) - } else { - None + let location = match disk_spill { + None => None, + Some(disk) => { + if disk.try_write(write_size as isize) { + disk.init()?; + Some(Location::Disk(disk.root.join(unique_name.clone()))) + } else { + None + } } - }) { - Some(Some(x)) => x, - _ => Location::Storage(format!("{location_prefix}/{unique_name}")), - }; + } + .unwrap_or(Location::Storage(format!( + "{location_prefix}/{unique_name}" + ))); let spilled_buckets_payloads = partitions .into_iter() diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index e6f60f99752a..31cd1353fcae 100644 --- a/src/query/service/src/spillers/spiller.rs +++ 
b/src/query/service/src/spillers/spiller.rs @@ -16,9 +16,12 @@ use std::collections::HashMap; use std::collections::HashSet; use std::fmt::Display; use std::fmt::Formatter; +use std::fs::create_dir; +use std::io::ErrorKind; use std::path::PathBuf; use std::sync::Arc; use std::sync::Mutex; +use std::sync::Once; use std::time::Instant; use databend_common_base::base::GlobalUniqName; @@ -255,21 +258,22 @@ pub fn deserialize_block(columns_layout: &[usize], mut data: &[u8]) -> DataBlock DataBlock::new_from_columns(columns) } -#[derive(Clone)] pub struct DiskSpill { pub root: PathBuf, - pub bytes_limit: Arc>, + pub bytes_limit: Mutex, + inited: Once, } impl DiskSpill { - pub fn new(root: PathBuf, limit: isize) -> DiskSpill { - DiskSpill { + pub fn new(root: PathBuf, limit: isize) -> Arc { + Arc::new(DiskSpill { root, - bytes_limit: Arc::new(Mutex::new(limit)), - } + bytes_limit: Mutex::new(limit), + inited: Once::new(), + }) } - pub fn try_write(&mut self, size: isize) -> bool { + pub fn try_write(&self, size: isize) -> bool { let mut guard = self.bytes_limit.lock().unwrap(); if *guard > size { *guard -= size; @@ -278,4 +282,16 @@ impl DiskSpill { false } } + + pub fn init(&self) -> Result<()> { + let mut rt = Ok(()); + self.inited.call_once(|| { + if let Err(e) = create_dir(&self.root) { + if !matches!(e.kind(), ErrorKind::AlreadyExists) { + rt = Err(e); + } + } + }); + Ok(rt?) + } } From cdf6d7842bf8360ed5645a4728d51f5256825c30 Mon Sep 17 00:00:00 2001 From: coldWater Date: Sat, 14 Sep 2024 12:43:15 +0800 Subject: [PATCH 11/40] bytes limit Signed-off-by: coldWater --- .../src/pipelines/builders/builder_window.rs | 9 ++- ...transform_window_partition_spill_writer.rs | 51 +++------------ src/query/service/src/spillers/spiller.rs | 65 +++++++++++++------ src/query/service/src/test_kits/fixture.rs | 3 +- src/query/settings/src/settings_default.rs | 6 ++ .../settings/src/settings_getter_setter.rs | 4 ++ .../storages/common/cache/src/temp_dir.rs | 12 +++- 7 files changed, 81 insertions(+), 69 deletions(-) diff --git a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index 2d53ecc19c25..fc016bb00785 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -186,11 +186,14 @@ impl PipelineBuilder { let location_prefix = query_spill_prefix(self.ctx.get_tenant().tenant_name(), &self.ctx.get_id()); + let disk_bytes_limit = self + .settings + .get_window_partition_spilling_to_disk_bytes_limit()?; let disk_spill = TempDirManager::instance() .get_disk_spill_config() .map(|cfg| { let root = cfg.path.join(self.ctx.get_id()); - DiskSpill::new(root, 5 << 20) // todo + DiskSpill::new(root, disk_bytes_limit as isize) }); self.main_pipeline.add_transform(|input, output| { @@ -220,8 +223,8 @@ impl PipelineBuilder { })?; let block_size = self.settings.get_max_block_size()? 
as usize; - let sort_spilling_batch_bytes = self.ctx.get_settings().get_sort_spilling_batch_bytes()?; - let enable_loser_tree = self.ctx.get_settings().get_enable_loser_tree_merge_sort()?; + let sort_spilling_batch_bytes = self.settings.get_sort_spilling_batch_bytes()?; + let enable_loser_tree = self.settings.get_enable_loser_tree_merge_sort()?; let have_order_col = window_partition.after_exchange.unwrap_or(false); self.main_pipeline.add_transform(|input, output| { diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs index 6e36d004665e..de3db501c457 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition_by/transform_window_partition_spill_writer.rs @@ -13,16 +13,11 @@ // limitations under the License. use std::any::Any; -use std::io; -use std::path::Path; use std::sync::Arc; use std::time::Instant; -use databend_common_base::base::dma_write_file_vectored; use databend_common_base::base::GlobalUniqName; use databend_common_base::base::ProgressValues; -use databend_common_base::runtime::profile::Profile; -use databend_common_base::runtime::profile::ProfileStatisticsName; use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; @@ -43,6 +38,9 @@ use super::WindowPartitionMeta; use crate::pipelines::processors::transforms::window::partition_by::SpillingWindowPayloads; use crate::pipelines::processors::transforms::window::partition_by::PARTITION_COUNT; use crate::sessions::QueryContext; +use crate::spillers::record_write_profile; +use crate::spillers::write_encodeds_to_disk; +use crate::spillers::write_encodes_to_storage; use crate::spillers::DiskSpill; use crate::spillers::EncodedBlock; @@ -253,20 +251,14 @@ pub fn spilling_window_payload( 0 } else { match &location { - Location::Storage(path) => write_to_storage(&operator, path, write_data).await?, - Location::Disk(path) => write_to_disk(path, write_data).await?, + Location::Storage(path) => { + write_encodes_to_storage(&operator, path, write_data).await? 
+ } + Location::Disk(path) => write_encodeds_to_disk(path, write_data).await?, } }; - // perf - { - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); - Profile::record_usize_profile( - ProfileStatisticsName::SpillWriteTime, - instant.elapsed().as_millis() as usize, - ); - } + record_write_profile(&instant, write_bytes); { let progress_val = ProgressValues { @@ -288,30 +280,3 @@ pub fn spilling_window_payload( }); Ok(future) } - -async fn write_to_storage( - operator: &Operator, - path: &str, - write_data: Vec, -) -> Result { - let mut writer = operator.writer_with(path).chunk(8 * 1024 * 1024).await?; - - let mut written = 0; - for data in write_data.into_iter().flat_map(|x| x.0) { - written += data.len(); - writer.write(data).await?; - } - - writer.close().await?; - Ok(written) -} - -async fn write_to_disk(path: impl AsRef, write_data: Vec) -> io::Result { - let bufs = write_data - .iter() - .flat_map(|x| &x.0) - .map(|data| io::IoSlice::new(data)) - .collect::>(); - - dma_write_file_vectored(path, &bufs).await -} diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index 31cd1353fcae..33d0540fbc21 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -17,13 +17,15 @@ use std::collections::HashSet; use std::fmt::Display; use std::fmt::Formatter; use std::fs::create_dir; -use std::io::ErrorKind; +use std::io; +use std::path::Path; use std::path::PathBuf; use std::sync::Arc; use std::sync::Mutex; use std::sync::Once; use std::time::Instant; +use databend_common_base::base::dma_write_file_vectored; use databend_common_base::base::GlobalUniqName; use databend_common_base::base::ProgressValues; use databend_common_base::runtime::profile::Profile; @@ -144,28 +146,14 @@ impl Spiller { let encoded = EncodedBlock::from_block(&block); let columns_layout = encoded.columns_layout(); - let write_bytes = encoded.size(); self.columns_layout .insert(location.to_string(), columns_layout); - let mut writer = self - .operator - .writer_with(&location) - .chunk(8 * 1024 * 1024) - .await?; + let write_bytes = + write_encodes_to_storage(&self.operator, &location, vec![encoded]).await?; - for data in encoded.0.into_iter() { - writer.write(data).await?; - } - writer.close().await?; - - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); - Profile::record_usize_profile( - ProfileStatisticsName::SpillWriteTime, - instant.elapsed().as_millis() as usize, - ); + record_write_profile(&instant, write_bytes); Ok(location) } @@ -287,7 +275,7 @@ impl DiskSpill { let mut rt = Ok(()); self.inited.call_once(|| { if let Err(e) = create_dir(&self.root) { - if !matches!(e.kind(), ErrorKind::AlreadyExists) { + if !matches!(e.kind(), io::ErrorKind::AlreadyExists) { rt = Err(e); } } @@ -295,3 +283,42 @@ impl DiskSpill { Ok(rt?) 
} } + +pub async fn write_encodes_to_storage( + operator: &Operator, + path: &str, + write_data: Vec, +) -> Result { + let mut writer = operator.writer_with(path).chunk(8 * 1024 * 1024).await?; + + let mut written = 0; + for data in write_data.into_iter().flat_map(|x| x.0) { + written += data.len(); + writer.write(data).await?; + } + + writer.close().await?; + Ok(written) +} + +pub async fn write_encodeds_to_disk( + path: impl AsRef, + write_data: Vec, +) -> io::Result { + let bufs = write_data + .iter() + .flat_map(|x| &x.0) + .map(|data| io::IoSlice::new(data)) + .collect::>(); + + dma_write_file_vectored(path, &bufs).await +} + +pub fn record_write_profile(start: &Instant, write_bytes: usize) { + Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); + Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); + Profile::record_usize_profile( + ProfileStatisticsName::SpillWriteTime, + start.elapsed().as_millis() as usize, + ); +} diff --git a/src/query/service/src/test_kits/fixture.rs b/src/query/service/src/test_kits/fixture.rs index 6130bbbf65c9..71bc6161e60c 100644 --- a/src/query/service/src/test_kits/fixture.rs +++ b/src/query/service/src/test_kits/fixture.rs @@ -17,7 +17,6 @@ use std::str; use std::sync::Arc; use databend_common_ast::ast::Engine; -use databend_common_base::runtime::drop_guard; use databend_common_catalog::catalog_kind::CATALOG_DEFAULT; use databend_common_catalog::cluster_info::Cluster; use databend_common_catalog::table::AppendMode; @@ -115,7 +114,7 @@ impl Drop for TestGuard { fn drop(&mut self) { #[cfg(debug_assertions)] { - drop_guard(move || { + databend_common_base::runtime::drop_guard(move || { databend_common_base::base::GlobalInstance::drop_testing(&self._thread_name); }) } diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index 2c189bb5681f..628c2f371574 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -460,6 +460,12 @@ impl DefaultSettings { mode: SettingMode::Both, range: Some(SettingRange::Numeric(0..=100)), }), + ("window_partition_spilling_to_disk_bytes_limit", DefaultSettingValue { + value: UserSettingValue::UInt64(10<<30), + desc: "Sets the maximum amount of local disk in bytes that each window partitioner can use before spilling data to storage during query execution.", + mode: SettingMode::Both, + range: Some(SettingRange::Numeric(0..=u64::MAX)), + }), ("sort_spilling_bytes_threshold_per_proc", DefaultSettingValue { value: UserSettingValue::UInt64(0), diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index 866e56772072..7c6126f931ee 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -395,6 +395,10 @@ impl Settings { Ok(self.try_get_u64("window_partition_spilling_bytes_threshold_per_proc")? as usize) } + pub fn get_window_partition_spilling_to_disk_bytes_limit(&self) -> Result { + Ok(self.try_get_u64("window_partition_spilling_to_disk_bytes_limit")? as usize) + } + pub fn get_window_partition_spilling_memory_ratio(&self) -> Result { Ok(self.try_get_u64("window_partition_spilling_memory_ratio")? 
as usize) } diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs index fddadaf1fef9..23c2b0649199 100644 --- a/src/query/storages/common/cache/src/temp_dir.rs +++ b/src/query/storages/common/cache/src/temp_dir.rs @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fs::create_dir_all; +use std::fs::remove_dir_all; +use std::io::ErrorKind; use std::path::PathBuf; use std::sync::Arc; @@ -55,7 +58,12 @@ pub struct TempDir { impl TempDir { fn init(&self) -> Result<()> { - let _ = std::fs::remove_dir_all(&self.path); - Ok(std::fs::create_dir_all(&self.path)?) + if let Err(e) = remove_dir_all(&self.path) { + if !matches!(e.kind(), ErrorKind::NotFound) { + Err(e)?; + } + } + + Ok(create_dir_all(&self.path)?) } } From 37a8be8be2bfd60f47d8f578dbf309765a1dee91 Mon Sep 17 00:00:00 2001 From: coldWater Date: Wed, 18 Sep 2024 15:28:23 +0800 Subject: [PATCH 12/40] rustix Signed-off-by: coldWater --- Cargo.lock | 47 +++++++++++++-------------- src/common/base/Cargo.toml | 1 + src/common/base/src/base/dma.rs | 56 +++++++++++++++------------------ 3 files changed, 50 insertions(+), 54 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 12ae34b87c44..f6a12f7e353d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -761,7 +761,7 @@ dependencies = [ "futures-lite 2.3.0", "parking", "polling 3.7.3", - "rustix 0.38.34", + "rustix 0.38.37", "slab", "tracing", "windows-sys 0.59.0", @@ -1914,7 +1914,7 @@ dependencies = [ "io-lifetimes 2.0.3", "ipnet", "maybe-owned", - "rustix 0.38.34", + "rustix 0.38.37", "windows-sys 0.52.0", "winx", ] @@ -1938,7 +1938,7 @@ dependencies = [ "cap-primitives", "io-extras", "io-lifetimes 2.0.3", - "rustix 0.38.34", + "rustix 0.38.37", ] [[package]] @@ -1951,7 +1951,7 @@ dependencies = [ "cap-primitives", "iana-time-zone", "once_cell", - "rustix 0.38.34", + "rustix 0.38.37", "winx", ] @@ -3209,6 +3209,7 @@ dependencies = [ "rand 0.8.5", "regex", "replace_with", + "rustix 0.38.37", "semver", "serde", "serde_json", @@ -6597,7 +6598,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e5768da2206272c81ef0b5e951a41862938a6070da63bcea197899942d3b947" dependencies = [ "cfg-if 1.0.0", - "rustix 0.38.34", + "rustix 0.38.37", "windows-sys 0.52.0", ] @@ -6876,7 +6877,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "033b337d725b97690d86893f9de22b67b80dcc4e9ad815f348254c38119db8fb" dependencies = [ "io-lifetimes 2.0.3", - "rustix 0.38.34", + "rustix 0.38.37", "windows-sys 0.52.0", ] @@ -6896,7 +6897,7 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8" dependencies = [ - "rustix 0.38.34", + "rustix 0.38.37", "windows-sys 0.52.0", ] @@ -7721,7 +7722,7 @@ dependencies = [ "itoa", "libc", "memmap2 0.9.4", - "rustix 0.38.34", + "rustix 0.38.37", "smallvec", "thiserror", ] @@ -7883,7 +7884,7 @@ dependencies = [ "gix-command", "gix-config-value", "parking_lot 0.12.3", - "rustix 0.38.34", + "rustix 0.38.37", "thiserror", ] @@ -10029,7 +10030,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2cffa4ad52c6f791f4f8b15f0c05f9824b2ced1160e88cc393d64fff9a8ac64" dependencies = [ - "rustix 0.38.34", + "rustix 0.38.37", ] [[package]] @@ -11673,7 +11674,7 @@ dependencies = [ "concurrent-queue", "hermit-abi 0.4.0", "pin-project-lite", - "rustix 0.38.34", + 
"rustix 0.38.37", "tracing", "windows-sys 0.59.0", ] @@ -11920,7 +11921,7 @@ dependencies = [ "hex", "lazy_static", "procfs-core", - "rustix 0.38.34", + "rustix 0.38.37", ] [[package]] @@ -13271,9 +13272,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.34" +version = "0.38.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" dependencies = [ "bitflags 2.6.0", "errno", @@ -14619,7 +14620,7 @@ dependencies = [ "cap-std", "fd-lock", "io-lifetimes 2.0.3", - "rustix 0.38.34", + "rustix 0.38.37", "windows-sys 0.52.0", "winx", ] @@ -14812,7 +14813,7 @@ dependencies = [ "cfg-if 1.0.0", "fastrand 2.1.0", "once_cell", - "rustix 0.38.34", + "rustix 0.38.37", "windows-sys 0.59.0", ] @@ -14841,7 +14842,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" dependencies = [ - "rustix 0.38.34", + "rustix 0.38.37", "windows-sys 0.48.0", ] @@ -15912,7 +15913,7 @@ dependencies = [ "io-lifetimes 2.0.3", "log", "once_cell", - "rustix 0.38.34", + "rustix 0.38.37", "system-interface", "thiserror", "tracing", @@ -16072,7 +16073,7 @@ dependencies = [ "once_cell", "paste", "rayon", - "rustix 0.38.34", + "rustix 0.38.37", "semver", "serde", "serde_derive", @@ -16115,7 +16116,7 @@ dependencies = [ "bincode 1.3.3", "directories-next", "log", - "rustix 0.38.34", + "rustix 0.38.37", "serde", "serde_derive", "sha2", @@ -16204,7 +16205,7 @@ dependencies = [ "anyhow", "cc", "cfg-if 1.0.0", - "rustix 0.38.34", + "rustix 0.38.37", "wasmtime-asm-macros", "wasmtime-versioned-export-macros", "windows-sys 0.52.0", @@ -16218,7 +16219,7 @@ checksum = "983ca409f2cd66385ce49486c022da0128acb7910c055beb5230998b49c6084c" dependencies = [ "object 0.33.0", "once_cell", - "rustix 0.38.34", + "rustix 0.38.37", "wasmtime-versioned-export-macros", ] @@ -16251,7 +16252,7 @@ dependencies = [ "memoffset", "paste", "psm", - "rustix 0.38.34", + "rustix 0.38.37", "sptr", "wasm-encoder 0.202.0", "wasmtime-asm-macros", diff --git a/src/common/base/Cargo.toml b/src/common/base/Cargo.toml index b8934c96d1bf..ab434f8558db 100644 --- a/src/common/base/Cargo.toml +++ b/src/common/base/Cargo.toml @@ -50,6 +50,7 @@ prometheus-parse = "0.2.3" rand = { workspace = true, features = ["serde1"] } regex = { workspace = true } replace_with = "0.1.7" +rustix = "0.38.37" semver = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index 6e0d3284c7df..05bd0e9815cb 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -21,9 +21,11 @@ use std::io::SeekFrom; use std::ops::Deref; use std::ops::DerefMut; use std::ops::Range; +use std::os::fd::BorrowedFd; use std::os::unix::io::AsRawFd; use std::path::Path; +use rustix::fs::OFlags; use tokio::fs::File; use tokio::io::AsyncSeekExt; @@ -130,7 +132,7 @@ impl DmaFile { async fn open(path: impl AsRef) -> io::Result { let file = File::options() .read(true) - .custom_flags(libc::O_DIRECT) + .custom_flags(OFlags::DIRECT.bits() as i32) .open(path) .await?; @@ -143,7 +145,7 @@ impl DmaFile { .write(true) .create(true) .truncate(true) - .custom_flags(libc::O_DIRECT | libc::O_EXCL) + .custom_flags((OFlags::DIRECT | OFlags::EXCL).bits() as i32) .open(path) .await?; @@ -182,34 +184,31 @@ impl 
DmaFile {
     fn write_direct(&mut self) -> io::Result<usize> {
         let buf = self.buffer();
-        let rt = unsafe { libc::write(self.fd.as_raw_fd(), buf.as_ptr().cast(), buf.len()) };
-        unsafe { self.mut_buffer().set_len(0) }
-        if rt >= 0 {
-            Ok(rt as usize)
-        } else {
-            Err(io::Error::last_os_error())
+        match rustix::io::write(&self.fd, &buf) {
+            Ok(n) => {
+                debug_assert_eq!(n, buf.len());
+                unsafe { self.mut_buffer().set_len(0) };
+                Ok(n)
+            }
+            Err(e) => Err(e.into()),
         }
     }
 
     fn read_direct(&mut self) -> io::Result<usize> {
-        let fd = self.fd.as_raw_fd();
-        let buf = self.mut_buffer();
-        let rt = unsafe { libc::read(fd, buf.as_mut_ptr().cast(), buf.capacity()) };
-        if rt >= 0 {
-            unsafe { buf.set_len(rt as usize) }
-            Ok(rt as usize)
-        } else {
-            Err(io::Error::last_os_error())
+        let Self { fd, buf, .. } = self;
+        let buf = buf.as_mut().unwrap();
+        unsafe { buf.set_len(buf.capacity()) };
+        match rustix::io::read(fd, buf) {
+            Ok(n) => {
+                unsafe { buf.set_len(n) };
+                Ok(n)
+            }
+            Err(e) => Err(e.into()),
         }
     }
 
-    fn truncate(&self, length: usize) -> io::Result<usize> {
-        let rt = unsafe { libc::ftruncate64(self.fd.as_raw_fd(), length as i64) };
-        if rt >= 0 {
-            Ok(rt as usize)
-        } else {
-            Err(io::Error::last_os_error())
-        }
+    fn truncate(&self, length: usize) -> io::Result<()> {
+        rustix::fs::ftruncate(&self.fd, length as u64).map_err(|e| e.into())
     }
 
     async fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
@@ -238,16 +237,11 @@ async fn open_dma(file: File) -> io::Result<DmaFile> {
     })
 }
 
-async fn fstatfs(file: &File) -> io::Result<libc::statfs> {
+async fn fstatfs(file: &File) -> io::Result<rustix::fs::StatFs> {
     let fd = file.as_raw_fd();
     asyncify(move || {
-        let mut statfs = std::mem::MaybeUninit::<libc::statfs>::uninit();
-        let ret = unsafe { libc::fstatfs(fd, statfs.as_mut_ptr()) };
-        if ret == -1 {
-            return Err(io::Error::last_os_error());
-        }
-
-        Ok(unsafe { statfs.assume_init() })
+        let fd = unsafe { BorrowedFd::borrow_raw(fd) };
+        rustix::fs::fstatfs(fd).map_err(|e| e.into())
     })
     .await
 }

From 121f15f8378c31696abd6789e97c47e31cfca7a7 Mon Sep 17 00:00:00 2001
From: coldWater
Date: Thu, 19 Sep 2024 12:20:12 +0800
Subject: [PATCH 13/40] rustix

Signed-off-by: coldWater
---
 Cargo.lock                                 |  1 +
 src/common/base/src/base/dma.rs            | 10 ++++------
 src/query/storages/common/cache/Cargo.toml |  1 +
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f6a12f7e353d..05d1e46cbb8a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5321,6 +5321,7 @@ dependencies = [
  "log",
  "parking_lot 0.12.3",
  "rayon",
+ "rustix 0.38.37",
  "siphasher",
  "tempfile",
 ]
diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs
index 05bd0e9815cb..bbf633427f63 100644
--- a/src/common/base/src/base/dma.rs
+++ b/src/common/base/src/base/dma.rs
@@ -225,10 +225,8 @@ pub fn align_down(alignment: usize, value: usize) -> usize {
 }
 
 async fn open_dma(file: File) -> io::Result<DmaFile> {
-    let statfs = fstatfs(&file).await?;
-    // TODO: the actual alignment may differ from the optimal io size? we should probably get
-    // this information from the the device the file lives on.
-    let alignment = statfs.f_bsize.max(512) as usize;
+    let stat = fstatvfs(&file).await?;
+    let alignment = stat.f_bsize.max(512) as usize;
 
     Ok(DmaFile {
         fd: file,
@@ -237,11 +235,11 @@
 
-async fn fstatfs(file: &File) -> io::Result<rustix::fs::StatFs> {
+async fn fstatvfs(file: &File) -> io::Result<rustix::fs::StatVfs> {
     let fd = file.as_raw_fd();
     asyncify(move || {
         let fd = unsafe { BorrowedFd::borrow_raw(fd) };
-        rustix::fs::fstatfs(fd).map_err(|e| e.into())
+        rustix::fs::fstatvfs(fd).map_err(|e| e.into())
     })
     .await
 }
diff --git a/src/query/storages/common/cache/Cargo.toml b/src/query/storages/common/cache/Cargo.toml
index d8c5b0bc4b08..8586b589ae10 100644
--- a/src/query/storages/common/cache/Cargo.toml
+++ b/src/query/storages/common/cache/Cargo.toml
@@ -32,6 +32,7 @@ log = { workspace = true }
 parking_lot = { workspace = true }
 rayon = "1.9.0"
 siphasher = "0.3.10"
+rustix = "0.38.37"
 
 [dev-dependencies]
 tempfile = "3.4.0"

From 1a1ae2a2d7851e8341b7045cee9090dd917f7cb8 Mon Sep 17 00:00:00 2001
From: coldWater
Date: Thu, 19 Sep 2024 12:20:23 +0800
Subject: [PATCH 14/40] temp

Signed-off-by: coldWater
---
 src/query/storages/common/cache/src/lib.rs  |   1 +
 src/query/storages/common/cache/src/temp.rs | 139 ++++++++++++++++++++
 2 files changed, 140 insertions(+)
 create mode 100644 src/query/storages/common/cache/src/temp.rs

diff --git a/src/query/storages/common/cache/src/lib.rs b/src/query/storages/common/cache/src/lib.rs
index b9be0ee5e59c..114eb3513f4a 100644
--- a/src/query/storages/common/cache/src/lib.rs
+++ b/src/query/storages/common/cache/src/lib.rs
@@ -20,6 +20,7 @@ mod caches;
 mod manager;
 mod providers;
 mod read;
+mod temp;
 mod temp_dir;
 
 pub use cache::CacheAccessor;
diff --git a/src/query/storages/common/cache/src/temp.rs b/src/query/storages/common/cache/src/temp.rs
new file mode 100644
index 000000000000..09dea2ca9fc0
--- /dev/null
+++ b/src/query/storages/common/cache/src/temp.rs
@@ -0,0 +1,139 @@
+// Copyright 2021 Datafuse Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
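The new `temp.rs` module below accounts for disk usage with an RAII handle: each `TempFile` keeps its directory's counters alive and gives its bytes back in `Drop`, so an early return or error path can never leak budget. A minimal standalone sketch of that idea, with illustrative names rather than the module's actual API:

use std::sync::{Arc, Mutex};

struct Budget {
    free: Arc<Mutex<usize>>,
}

struct Reservation {
    bytes: usize,
    free: Arc<Mutex<usize>>,
}

impl Budget {
    // Claim `bytes` from the shared budget, or refuse if it would overrun.
    fn reserve(&self, bytes: usize) -> Option<Reservation> {
        let mut free = self.free.lock().unwrap();
        if *free < bytes {
            return None;
        }
        *free -= bytes;
        Some(Reservation {
            bytes,
            free: Arc::clone(&self.free),
        })
    }
}

impl Drop for Reservation {
    // Return the bytes when the holder goes away, even on early return.
    fn drop(&mut self) {
        *self.free.lock().unwrap() += self.bytes;
    }
}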
+
+use std::collections::hash_map::Entry;
+use std::collections::HashMap;
+use std::ops::Drop;
+use std::path::Path;
+use std::sync::atomic::AtomicUsize;
+use std::sync::atomic::Ordering;
+use std::sync::Arc;
+use std::sync::Mutex;
+
+use databend_common_base::base::GlobalUniqName;
+use databend_common_exception::Result;
+
+pub struct TempFileManager {
+    root: Box<Path>,
+    total_limit: usize,
+    dir_limit: usize,
+    reserved: usize,
+
+    group: Mutex<Group>,
+}
+
+struct Group {
+    dirs: HashMap<Box<Path>, Arc<DirInfo>>,
+}
+
+impl Group {
+    fn size(&self) -> usize {
+        self.dirs.iter().map(|(_, v)| *v.size.lock().unwrap()).sum()
+    }
+}
+
+#[derive(Debug, Default)]
+struct DirInfo {
+    count: AtomicUsize,
+    size: Mutex<usize>,
+}
+
+impl TempFileManager {
+    pub fn get_dir(self: &Arc<Self>, id: &str) -> TempDir {
+        let path = self.root.join(id).into_boxed_path();
+
+        let mut group = self.group.lock().unwrap();
+
+        match group.dirs.entry(path.clone()) {
+            Entry::Occupied(o) => TempDir {
+                path,
+                dir_info: o.get().clone(),
+                manager: self.clone(),
+            },
+            Entry::Vacant(v) => {
+                let dir_info = Arc::new(DirInfo::default());
+                v.insert(dir_info.clone());
+                TempDir {
+                    path,
+                    dir_info,
+                    manager: self.clone(),
+                }
+            }
+        }
+    }
+
+    // pub fn used(&self, size: usize) -> Result<bool> {
+    //     let stat = rustix::fs::statvfs(self.root)?;
+    //     stat.f_bavail > self.reserved + (size + stat.f_frsize - 1) / stat.f_frsize
+    // }
+}
+
+pub struct TempDir {
+    path: Box<Path>,
+    dir_info: Arc<DirInfo>,
+    manager: Arc<TempFileManager>,
+}
+
+impl TempDir {
+    pub fn new_file_with_size(&self, size: usize) -> Option<TempFile> {
+        let path = self.path.join(GlobalUniqName::unique()).into_boxed_path();
+
+        let dir_info = self.dir_info.clone();
+
+        let group = self.manager.group.lock().unwrap();
+        let mut dir_size = dir_info.size.lock().unwrap();
+        if self.manager.dir_limit < *dir_size + size
+            || self.manager.total_limit < group.size() + size
+        {
+            return None;
+        }
+
+        *dir_size += size;
+        drop(dir_size);
+
+        dir_info.count.fetch_add(1, Ordering::SeqCst);
+        Some(TempFile {
+            path,
+            size,
+            dir_info,
+        })
+    }
+}
+
+#[derive(Debug)]
+pub struct TempFile {
+    path: Box<Path>,
+    size: usize,
+    dir_info: Arc<DirInfo>,
+}
+
+impl Drop for TempFile {
+    fn drop(&mut self) {
+        self.dir_info.count.fetch_sub(1, Ordering::SeqCst);
+
+        let mut guard = self.dir_info.size.lock().unwrap();
+        *guard -= self.size;
+
+        let _ = std::fs::remove_file(&self.path);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    #[test]
+    fn test_xxx() {
+        println!("aa")
+    }
+}

From e109d352c2684c2b3aada93044c1262ff9b22fed Mon Sep 17 00:00:00 2001
From: coldWater
Date: Thu, 19 Sep 2024 16:08:11 +0800
Subject: [PATCH 15/40] Location

Signed-off-by: coldWater
---
 .../transforms/transform_sort_spill.rs    |  5 +-
 src/query/service/src/spillers/mod.rs     |  1 +
 src/query/service/src/spillers/spiller.rs | 91 +++++++++++++------
 3 files changed, 65 insertions(+), 32 deletions(-)

diff --git a/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs
index 67e9e0d21c4c..004b49862d51 100644
--- a/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs
+++ b/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs
@@ -43,6 +43,7 @@ use databend_common_pipeline_transforms::processors::sort::SortSpillMeta;
 use databend_common_pipeline_transforms::processors::sort::SortSpillMetaWithParams;
 use databend_common_pipeline_transforms::processors::sort::SortedStream;
 
+use crate::spillers::Location;
 use crate::spillers::Spiller;
 
 enum State {
@@ -77,7 +78,7 @@ pub struct
TransformSortSpill { /// Blocks to merge one time. num_merge: usize, /// Unmerged list of blocks. Each list are sorted. - unmerged_blocks: VecDeque>, + unmerged_blocks: VecDeque>, /// If `ummerged_blocks.len()` < `num_merge`, /// we can use a final merger to merge the last few sorted streams to reduce IO. @@ -359,7 +360,7 @@ where R: Rows + Sync + Send + 'static } enum BlockStream { - Spilled((VecDeque, Arc)), + Spilled((VecDeque, Arc)), Block(Option), } diff --git a/src/query/service/src/spillers/mod.rs b/src/query/service/src/spillers/mod.rs index 803e61b7b972..e27e973fad4f 100644 --- a/src/query/service/src/spillers/mod.rs +++ b/src/query/service/src/spillers/mod.rs @@ -18,6 +18,7 @@ mod spiller; pub use partition_buffer::PartitionBuffer; pub use partition_buffer::PartitionBufferFetchOption; pub use spiller::DiskSpill; +pub use spiller::Location; pub use spiller::SpilledData; pub use spiller::Spiller; pub use spiller::SpillerConfig; diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index 6fb240dfd2ca..4c810fd13da1 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -26,6 +26,7 @@ use std::sync::Mutex; use std::sync::Once; use std::time::Instant; +use databend_common_base::base::dma_read_file; use databend_common_base::base::dma_write_file_vectored; use databend_common_base::base::GlobalUniqName; use databend_common_base::base::ProgressValues; @@ -88,9 +89,9 @@ pub struct Spiller { _spiller_type: SpillerType, pub join_spilling_partition_bits: usize, /// 1 partition -> N partition files - pub partition_location: HashMap>, + pub partition_location: HashMap>, /// Record columns layout for spilled data, will be used when read data from disk - pub columns_layout: HashMap>, + pub columns_layout: HashMap>, /// Record how many bytes have been spilled for each partition. pub partition_spilled_bytes: HashMap, } @@ -121,15 +122,16 @@ impl Spiller { } /// Spill a [`DataBlock`] to storage. - pub async fn spill(&mut self, data_block: DataBlock) -> Result { + pub async fn spill(&mut self, data_block: DataBlock) -> Result { let instant = Instant::now(); // Spill data to storage. let unique_name = GlobalUniqName::unique(); - let location = format!("{}/{}", self.config.location_prefix, unique_name); + let location = format!("{}/{unique_name}", self.config.location_prefix); let encoded = EncodedBlock::from_block(&data_block); let columns_layout = encoded.columns_layout(); let data_size = write_encodes_to_storage(&self.operator, &location, vec![encoded]).await?; + let location = Location::Storage(location); // Record statistics. record_write_profile(&instant, data_size); @@ -194,7 +196,7 @@ impl Spiller { // Spill data to storage. let instant = Instant::now(); let unique_name = GlobalUniqName::unique(); - let location = format!("{}/{}", self.config.location_prefix, unique_name); + let location = format!("{}/{unique_name}", self.config.location_prefix); write_encodes_to_storage(&self.operator, &location, write_data).await?; @@ -202,35 +204,43 @@ impl Spiller { record_write_profile(&instant, write_bytes); Ok(SpilledData::MergedPartition { - location, + location: Location::Storage(location), partitions: spilled_partitions, }) } /// Read a certain file to a [`DataBlock`]. /// We should guarantee that the file is managed by this spiller. 
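Both read paths below end in `deserialize_block(columns_layout, &data)`. The layout recorded at spill time is the byte length of each serialized column, which is also why the disk path can pre-size its read buffer with `columns_layout.iter().sum()`. Splitting the flat buffer back into columns is then a running-offset walk, roughly as in this sketch (the real deserializer additionally rebuilds the column types):

// Split a flat spill buffer back into per-column byte slices using the
// lengths recorded when the block was spilled.
fn split_columns<'a>(layout: &[usize], mut data: &'a [u8]) -> Vec<&'a [u8]> {
    let mut cols = Vec::with_capacity(layout.len());
    for len in layout {
        let (col, rest) = data.split_at(*len);
        cols.push(col);
        data = rest;
    }
    cols
}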
- pub async fn read_spilled_file(&self, file: &str) -> Result { - debug_assert!(self.columns_layout.contains_key(file)); + pub async fn read_spilled_file(&self, location: &Location) -> Result { + let columns_layout = self.columns_layout.get(location).unwrap(); // Read spilled data from storage. let instant = Instant::now(); - let data = self.operator.read(file).await?.to_bytes(); + let data = match location { + Location::Storage(loc) => self.operator.read(loc).await?.to_bytes(), + Location::Disk(path) => { + let cap = columns_layout.iter().sum(); + let mut data = Vec::with_capacity(cap); + dma_read_file(path, &mut data).await?; + data.into() + } + }; // Record statistics. record_read_profile(&instant, data.len()); // Deserialize data block. - let columns_layout = self.columns_layout.get(file).unwrap(); Ok(deserialize_block(columns_layout, &data)) } #[async_backtrace::framed] /// Read spilled data with partition id pub async fn read_spilled_partition(&mut self, p_id: &usize) -> Result> { - if let Some(files) = self.partition_location.get(p_id) { - let mut spilled_data = Vec::with_capacity(files.len()); - for file in files.iter() { - let block = self.read_spilled_file(file).await?; + if let Some(locs) = self.partition_location.get(p_id) { + let mut spilled_data = Vec::with_capacity(locs.len()); + for loc in locs.iter() { + let block = self.read_spilled_file(loc).await?; + if block.num_rows() != 0 { spilled_data.push(block); } @@ -252,7 +262,21 @@ impl Spiller { { // Read spilled data from storage. let instant = Instant::now(); - let data = self.operator.read(location).await?.to_bytes(); + + let data = match location { + Location::Storage(loc) => self.operator.read(loc).await?.to_bytes(), + Location::Disk(path) => { + let cap = if let Some((_, range, _)) = partitions.last() { + range.end + } else { + 0 + }; + + let mut data = Vec::with_capacity(cap); + dma_read_file(path, &mut data).await?; + data.into() + } + }; // Record statistics. record_read_profile(&instant, data.len()); @@ -273,18 +297,25 @@ impl Spiller { pub async fn read_range( &self, - location: &str, + location: &Location, data_range: Range, columns_layout: &[usize], ) -> Result { // Read spilled data from storage. let instant = Instant::now(); - let data = self - .operator - .read_with(location) - .range(data_range.start as u64..data_range.end as u64) - .await? - .to_bytes(); + + let data = match location { + Location::Storage(loc) => { + let range = data_range.start as u64..data_range.end as u64; + self.operator.read_with(loc).range(range).await?.to_bytes() + } + Location::Disk(path) => { + let cap = columns_layout.iter().sum(); + let mut data = Vec::with_capacity(cap); + dma_read_file(path, &mut data).await?; + data.into() + } + }; // Record statistics. 
record_read_profile(&instant, data.len()); @@ -293,7 +324,7 @@ impl Spiller { Ok(deserialize_block(columns_layout, &data)) } - pub(crate) fn spilled_files(&self) -> Vec { + pub(crate) fn spilled_files(&self) -> Vec { self.columns_layout.keys().cloned().collect() } } @@ -301,11 +332,17 @@ impl Spiller { pub enum SpilledData { Partition(String), MergedPartition { - location: String, + location: Location, partitions: Vec<(usize, Range, Vec)>, }, } +#[derive(Debug, Clone, Hash, PartialEq, Eq)] +pub enum Location { + Storage(String), + Disk(Box), +} + pub struct EncodedBlock(pub Vec>); impl EncodedBlock { @@ -383,12 +420,6 @@ impl DiskSpill { } } -#[derive(Debug, Clone)] -pub enum Location { - Storage(String), - Disk(PathBuf), -} - pub async fn write_encodes_to_storage( operator: &Operator, path: &str, From f4c844fdde9db4f05df57473063b91adf20aec23 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 19 Sep 2024 18:35:22 +0800 Subject: [PATCH 16/40] builder Signed-off-by: coldWater --- Cargo.lock | 1 - .../src/pipelines/builders/builder_sort.rs | 9 +- .../src/pipelines/builders/builder_window.rs | 55 +++++----- .../transforms/hash_join/hash_join_spiller.rs | 2 +- .../transforms/transform_sort_spill.rs | 1 + .../transform_window_partition_collect.rs | 25 ++++- .../partition/window_partition_buffer.rs | 20 +--- src/query/service/src/spillers/spiller.rs | 101 +++++++++++------- .../service/tests/it/spillers/spiller.rs | 14 ++- src/query/storages/common/cache/Cargo.toml | 1 - 10 files changed, 138 insertions(+), 91 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a16c0aa34de3..a78b75e21b5c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5402,7 +5402,6 @@ dependencies = [ "log", "parking_lot 0.12.3", "rayon", - "rustix 0.38.37", "siphasher", "tempfile", ] diff --git a/src/query/service/src/pipelines/builders/builder_sort.rs b/src/query/service/src/pipelines/builders/builder_sort.rs index 5ae5aaa1b345..5d22f8b0b392 100644 --- a/src/query/service/src/pipelines/builders/builder_sort.rs +++ b/src/query/service/src/pipelines/builders/builder_sort.rs @@ -291,8 +291,13 @@ impl SortPipelineBuilder { )); pipeline.add_transform(|input, output| { let op = DataOperator::instance().operator(); - let spiller = - Spiller::create(self.ctx.clone(), op, config.clone(), SpillerType::OrderBy)?; + let spiller = Spiller::create( + self.ctx.clone(), + op, + config.clone(), + None, + SpillerType::OrderBy, + )?; Ok(ProcessorPtr::create(create_transform_sort_spill( input, output, diff --git a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index f6854da31022..4b7e579e9b05 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
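Patch 16 threads the optional local-disk budget through every `Spiller::create` call site, which is why `builder_sort.rs` above now passes an explicit `None`. What a `DiskSpill` handle enforces is the `try_write`/`can_write` admission check from `spiller.rs`; reduced to a sketch (the type name here is illustrative):

use std::sync::Mutex;

// A signed byte budget: a write is admitted only while the remaining
// budget exceeds its size, and each admission consumes its share.
struct DiskBudget {
    bytes_limit: Mutex<isize>,
}

impl DiskBudget {
    fn can_write(&self, size: isize) -> bool {
        let mut remaining = self.bytes_limit.lock().unwrap();
        if *remaining > size {
            *remaining -= size;
            true
        } else {
            false
        }
    }
}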
-use std::sync::atomic::AtomicUsize; - use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; use databend_common_expression::types::DataType; @@ -25,6 +23,7 @@ use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_core::Pipe; use databend_common_sql::executor::physical_plans::Window; use databend_common_sql::executor::physical_plans::WindowPartition; +use databend_storages_common_cache::TempDirManager; use crate::pipelines::processors::transforms::FrameBound; use crate::pipelines::processors::transforms::TransformWindowPartitionCollect; @@ -143,11 +142,6 @@ impl PipelineBuilder { // Settings. let settings = self.ctx.get_settings(); let num_partitions = settings.get_window_num_partitions()?; - let max_block_size = settings.get_max_block_size()? as usize; - let sort_block_size = settings.get_window_partition_sort_block_size()? as usize; - let sort_spilling_batch_bytes = settings.get_sort_spilling_batch_bytes()?; - let enable_loser_tree = settings.get_enable_loser_tree_merge_sort()?; - let window_spill_settings = WindowSpillSettings::new(settings.clone(), num_processors)?; let plan_schema = window_partition.output_schema()?; let partition_by = window_partition @@ -155,20 +149,6 @@ impl PipelineBuilder { .iter() .map(|index| plan_schema.index_of(&index.to_string())) .collect::>>()?; - let sort_desc = window_partition - .order_by - .iter() - .map(|desc| { - let offset = plan_schema.index_of(&desc.order_by.to_string())?; - Ok(SortColumnDescription { - offset, - asc: desc.asc, - nulls_first: desc.nulls_first, - is_nullable: plan_schema.field(offset).is_nullable(), - }) - }) - .collect::>>()?; - let have_order_col = window_partition.after_exchange.unwrap_or(false); // 1. Build window partition scatter processors. let mut pipe_items = Vec::with_capacity(num_processors); @@ -195,16 +175,43 @@ impl PipelineBuilder { } self.main_pipeline.reorder_inputs(rule); + let max_block_size = settings.get_max_block_size()? as usize; + let sort_block_size = settings.get_window_partition_sort_block_size()? as usize; + let sort_spilling_batch_bytes = settings.get_sort_spilling_batch_bytes()?; + let enable_loser_tree = settings.get_enable_loser_tree_merge_sort()?; + let have_order_col = window_partition.after_exchange.unwrap_or(false); + let sort_desc = window_partition + .order_by + .iter() + .map(|desc| { + let offset = plan_schema.index_of(&desc.order_by.to_string())?; + Ok(SortColumnDescription { + offset, + asc: desc.asc, + nulls_first: desc.nulls_first, + is_nullable: plan_schema.field(offset).is_nullable(), + }) + }) + .collect::>>()?; + let disk_bytes_limit = settings.get_window_partition_spilling_to_disk_bytes_limit()?; + let disk_spill = TempDirManager::instance() + .get_disk_spill_config() + .map(|cfg| { + let root = cfg.path.join(self.ctx.get_id()); + DiskSpill::new(root, disk_bytes_limit as isize) + }); + let window_spill_settings = WindowSpillSettings::new(&settings, num_processors)?; + // 3. Build window partition collect processors. 
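The hunk below drops the `AtomicUsize` id counter: the collect processors are constructed sequentially on one thread, so `for processor_id in 0..num_processors` hands out the same ids without any memory-ordering concerns. The two forms are equivalent under single-threaded construction:

use std::sync::atomic::{AtomicUsize, Ordering};

// Old style: a fetch_add counter, only needed if ids were taken concurrently.
fn ids_atomic(n: usize) -> Vec<usize> {
    let next = AtomicUsize::new(0);
    (0..n).map(|_| next.fetch_add(1, Ordering::AcqRel)).collect()
}

// New style: a plain range, since construction happens on a single thread.
fn ids_range(n: usize) -> Vec<usize> {
    (0..n).collect()
}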
- let processor_id = AtomicUsize::new(0); let mut pipe_items = Vec::with_capacity(num_processors); - for _ in 0..num_processors { + for processor_id in 0..num_processors { let processor = TransformWindowPartitionCollect::new( self.ctx.clone(), - processor_id.fetch_add(1, std::sync::atomic::Ordering::AcqRel), + processor_id, num_processors, num_partitions, window_spill_settings.clone(), + disk_spill.clone(), sort_desc.clone(), plan_schema.clone(), max_block_size, diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs index c9c0a9977341..c858ffb98dc9 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs @@ -72,7 +72,7 @@ impl HashJoinSpiller { } else { SpillerType::HashJoinProbe }; - let spiller = Spiller::create(ctx.clone(), operator, spill_config, spiller_type)?; + let spiller = Spiller::create(ctx.clone(), operator, spill_config, None, spiller_type)?; let num_partitions = (1 << spill_partition_bits) as usize; // The memory threshold of each partition, we will spill the partition data diff --git a/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs index 004b49862d51..37823566665e 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs @@ -490,6 +490,7 @@ mod tests { ctx.clone(), op, SpillerConfig::create("_spill_test".to_string()), + None, SpillerType::OrderBy, )?; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs index 0681acce0243..dbf47d38110f 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -30,13 +30,20 @@ use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_core::query_spill_prefix; use databend_common_pipeline_core::PipeItem; use databend_common_pipeline_transforms::processors::sort_merge; +use databend_common_storage::DataOperator; +use databend_common_storages_fuse::TableContext; use super::WindowPartitionBuffer; use super::WindowPartitionMeta; use super::WindowSpillSettings; use crate::sessions::QueryContext; +use crate::spillers::DiskSpill; +use crate::spillers::Spiller; +use crate::spillers::SpillerConfig; +use crate::spillers::SpillerType; #[derive(Debug, Clone, Copy)] pub enum Step { @@ -90,6 +97,7 @@ impl TransformWindowPartitionCollect { num_processors: usize, num_partitions: usize, spill_settings: WindowSpillSettings, + disk_spill: Option>, sort_desc: Vec, schema: DataSchemaRef, max_block_size: usize, @@ -112,9 +120,24 @@ impl TransformWindowPartitionCollect { partition_id[*partition] = new_partition_id; } + let spill_config = SpillerConfig::create(query_spill_prefix( + ctx.get_tenant().tenant_name(), + 
&ctx.get_id(), + )); + + // Create an inner `Spiller` to spill data. + let operator = DataOperator::instance().operator(); + let spiller = Spiller::create( + ctx.clone(), + operator, + spill_config, + disk_spill, + SpillerType::Window, + )?; + // Create the window partition buffer. let buffer = - WindowPartitionBuffer::new(ctx, partitions.len(), sort_block_size, spill_settings)?; + WindowPartitionBuffer::new(spiller, num_partitions, sort_block_size, spill_settings)?; Ok(Self { inputs, diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs index dd93b0016c3c..28c6b9b2068d 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs @@ -12,23 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; - use databend_common_base::runtime::GLOBAL_MEM_STAT; use databend_common_exception::Result; use databend_common_expression::DataBlock; -use databend_common_pipeline_core::query_spill_prefix; use databend_common_settings::Settings; -use databend_common_storage::DataOperator; -use databend_common_storages_fuse::TableContext; -use crate::sessions::QueryContext; use crate::spillers::PartitionBuffer; use crate::spillers::PartitionBufferFetchOption; use crate::spillers::SpilledData; use crate::spillers::Spiller; -use crate::spillers::SpillerConfig; -use crate::spillers::SpillerType; /// The `WindowPartitionBuffer` is used to control memory usage of Window operator. pub struct WindowPartitionBuffer { @@ -46,19 +38,11 @@ pub struct WindowPartitionBuffer { impl WindowPartitionBuffer { pub fn new( - ctx: Arc, + spiller: Spiller, num_partitions: usize, sort_block_size: usize, spill_settings: WindowSpillSettings, ) -> Result { - // Create an inner `Spiller` to spill data. - let spill_config = SpillerConfig::create(query_spill_prefix( - ctx.get_tenant().tenant_name(), - &ctx.get_id(), - )); - let operator = DataOperator::instance().operator(); - let spiller = Spiller::create(ctx.clone(), operator, spill_config, SpillerType::Window)?; - // Create a `PartitionBuffer` to store partitioned data. 
let partition_buffer = PartitionBuffer::create(num_partitions); let restored_partition_buffer = PartitionBuffer::create(num_partitions); @@ -296,7 +280,7 @@ pub struct WindowSpillSettings { } impl WindowSpillSettings { - pub fn new(settings: Arc, num_threads: usize) -> Result { + pub fn new(settings: &Settings, num_threads: usize) -> Result { let global_memory_ratio = std::cmp::min(settings.get_window_partition_spilling_memory_ratio()?, 100) as f64 / 100_f64; diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index 4c810fd13da1..c17935a14da6 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -86,6 +86,7 @@ pub struct Spiller { ctx: Arc, operator: Operator, config: SpillerConfig, + disk_spill: Option>, _spiller_type: SpillerType, pub join_spilling_partition_bits: usize, /// 1 partition -> N partition files @@ -102,6 +103,7 @@ impl Spiller { ctx: Arc, operator: Operator, config: SpillerConfig, + disk_spill: Option>, spiller_type: SpillerType, ) -> Result { let join_spilling_partition_bits = ctx.get_settings().get_join_spilling_partition_bits()?; @@ -109,6 +111,7 @@ impl Spiller { ctx: ctx.clone(), operator, config, + disk_spill, _spiller_type: spiller_type, join_spilling_partition_bits, partition_location: Default::default(), @@ -126,12 +129,10 @@ impl Spiller { let instant = Instant::now(); // Spill data to storage. - let unique_name = GlobalUniqName::unique(); - let location = format!("{}/{unique_name}", self.config.location_prefix); let encoded = EncodedBlock::from_block(&data_block); let columns_layout = encoded.columns_layout(); - let data_size = write_encodes_to_storage(&self.operator, &location, vec![encoded]).await?; - let location = Location::Storage(location); + let data_size = encoded.size(); + let location = self.write_encodes(data_size, vec![encoded]).await?; // Record statistics. record_write_profile(&instant, data_size); @@ -195,16 +196,13 @@ impl Spiller { // Spill data to storage. let instant = Instant::now(); - let unique_name = GlobalUniqName::unique(); - let location = format!("{}/{unique_name}", self.config.location_prefix); - - write_encodes_to_storage(&self.operator, &location, write_data).await?; + let location = self.write_encodes(write_bytes, write_data).await?; // Record statistics. 
record_write_profile(&instant, write_bytes); Ok(SpilledData::MergedPartition { - location: Location::Storage(location), + location, partitions: spilled_partitions, }) } @@ -324,13 +322,64 @@ impl Spiller { Ok(deserialize_block(columns_layout, &data)) } + async fn write_encodes(&mut self, size: usize, blocks: Vec) -> Result { + let unique_name = GlobalUniqName::unique(); + let location = match &self.disk_spill { + None => None, + Some(disk) => { + if disk.can_write(size as isize) { + disk.init()?; + Some(Location::Disk( + disk.root.join(unique_name.clone()).into_boxed_path(), + )) + } else { + None + } + } + } + .unwrap_or(Location::Storage(format!( + "{}/{unique_name}", + self.config.location_prefix + ))); + + let written = match &location { + Location::Storage(loc) => { + let mut writer = self + .operator + .writer_with(loc) + .chunk(8 * 1024 * 1024) + .await?; + + let mut written = 0; + for data in blocks.into_iter().flat_map(|x| x.0) { + written += data.len(); + writer.write(data).await?; + } + + writer.close().await?; + written + } + Location::Disk(path) => { + let bufs = blocks + .iter() + .flat_map(|x| &x.0) + .map(|data| io::IoSlice::new(data)) + .collect::>(); + + dma_write_file_vectored(path, &bufs).await? + } + }; + debug_assert_eq!(size, written); + Ok(location) + } + pub(crate) fn spilled_files(&self) -> Vec { self.columns_layout.keys().cloned().collect() } } pub enum SpilledData { - Partition(String), + Partition(Location), MergedPartition { location: Location, partitions: Vec<(usize, Range, Vec)>, @@ -397,7 +446,7 @@ impl DiskSpill { }) } - pub fn try_write(&self, size: isize) -> bool { + pub fn can_write(&self, size: isize) -> bool { let mut guard = self.bytes_limit.lock().unwrap(); if *guard > size { *guard -= size; @@ -420,36 +469,6 @@ impl DiskSpill { } } -pub async fn write_encodes_to_storage( - operator: &Operator, - path: &str, - write_data: Vec, -) -> Result { - let mut writer = operator.writer_with(path).chunk(8 * 1024 * 1024).await?; - - let mut written = 0; - for data in write_data.into_iter().flat_map(|x| x.0) { - written += data.len(); - writer.write(data).await?; - } - - writer.close().await?; - Ok(written) -} - -pub async fn write_encodeds_to_disk( - path: impl AsRef, - write_data: Vec, -) -> io::Result { - let bufs = write_data - .iter() - .flat_map(|x| &x.0) - .map(|data| io::IoSlice::new(data)) - .collect::>(); - - dma_write_file_vectored(path, &bufs).await -} - pub fn record_write_profile(start: &Instant, write_bytes: usize) { Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); diff --git a/src/query/service/tests/it/spillers/spiller.rs b/src/query/service/tests/it/spillers/spiller.rs index 387ccd5a4062..3e627bc99caa 100644 --- a/src/query/service/tests/it/spillers/spiller.rs +++ b/src/query/service/tests/it/spillers/spiller.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
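The updated test below asserts on the `Location` variant instead of checking a `"_query_spill"` path prefix. `std::assert_matches` is still a nightly feature, so this relies on the crate building with that feature enabled; a self-contained sketch of the same check against a trimmed-down stand-in enum:

#![feature(assert_matches)]
use std::assert_matches::assert_matches;
use std::path::PathBuf;

// Stand-in for the spiller's Location enum (the real Disk variant
// holds a Box<Path>).
#[derive(Debug)]
enum Location {
    Storage(String),
    Disk(PathBuf),
}

fn main() {
    let loc = Location::Storage("_query_spill/q1/f1".into());
    // On failure this prints the actual variant, unlike assert!(matches!(..)).
    assert_matches!(loc, Location::Storage(_));
}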
+use std::assert_matches::assert_matches; + use databend_common_base::base::tokio; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; @@ -24,6 +26,7 @@ use databend_common_expression::FromData; use databend_common_expression::ScalarRef; use databend_common_pipeline_core::query_spill_prefix; use databend_common_storage::DataOperator; +use databend_query::spillers::Location; use databend_query::spillers::Spiller; use databend_query::spillers::SpillerConfig; use databend_query::spillers::SpillerType; @@ -39,7 +42,13 @@ async fn test_spill_with_partition() -> Result<()> { SpillerConfig::create(query_spill_prefix(tenant.tenant_name(), &ctx.get_id())); let operator = DataOperator::instance().operator(); - let mut spiller = Spiller::create(ctx, operator, spiller_config, SpillerType::HashJoinBuild)?; + let mut spiller = Spiller::create( + ctx, + operator, + spiller_config, + None, + SpillerType::HashJoinBuild, + )?; // Generate data block: two columns, type is i32, 100 rows let data = DataBlock::new_from_columns(vec![ @@ -50,7 +59,8 @@ async fn test_spill_with_partition() -> Result<()> { let res = spiller.spill_with_partition(0, data).await; assert!(res.is_ok()); - assert!(spiller.partition_location.get(&0).unwrap()[0].starts_with("_query_spill")); + let location = &spiller.partition_location.get(&0).unwrap()[0]; + assert_matches!(location, Location::Storage(_)); // Test read spilled data let block = DataBlock::concat(&spiller.read_spilled_partition(&(0)).await?)?; diff --git a/src/query/storages/common/cache/Cargo.toml b/src/query/storages/common/cache/Cargo.toml index 8586b589ae10..d8c5b0bc4b08 100644 --- a/src/query/storages/common/cache/Cargo.toml +++ b/src/query/storages/common/cache/Cargo.toml @@ -32,7 +32,6 @@ log = { workspace = true } parking_lot = { workspace = true } rayon = "1.9.0" siphasher = "0.3.10" -rustix = "0.38.37" [dev-dependencies] tempfile = "3.4.0" From a525891a021df3e6f54f359c357b7944a8f77253 Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 19 Sep 2024 20:48:13 +0800 Subject: [PATCH 17/40] fix Signed-off-by: coldWater --- src/query/service/src/spillers/spiller.rs | 30 ++++++++++++----------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index c17935a14da6..994cedd82d60 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -27,6 +27,7 @@ use std::sync::Once; use std::time::Instant; use databend_common_base::base::dma_read_file; +use databend_common_base::base::dma_read_file_range; use databend_common_base::base::dma_write_file_vectored; use databend_common_base::base::GlobalUniqName; use databend_common_base::base::ProgressValues; @@ -301,25 +302,26 @@ impl Spiller { ) -> Result { // Read spilled data from storage. let instant = Instant::now(); + let data_range = data_range.start as u64..data_range.end as u64; - let data = match location { + match location { Location::Storage(loc) => { - let range = data_range.start as u64..data_range.end as u64; - self.operator.read_with(loc).range(range).await?.to_bytes() + let data = self + .operator + .read_with(loc) + .range(data_range) + .await? 
+ .to_bytes(); + record_read_profile(&instant, data.len()); + Ok(deserialize_block(columns_layout, &data)) } Location::Disk(path) => { - let cap = columns_layout.iter().sum(); - let mut data = Vec::with_capacity(cap); - dma_read_file(path, &mut data).await?; - data.into() + let (buf, range) = dma_read_file_range(path, data_range).await?; + let data = &buf[range]; + record_read_profile(&instant, data.len()); + Ok(deserialize_block(columns_layout, &data)) } - }; - - // Record statistics. - record_read_profile(&instant, data.len()); - - // Deserialize data block. - Ok(deserialize_block(columns_layout, &data)) + } } async fn write_encodes(&mut self, size: usize, blocks: Vec) -> Result { From dbf516b6938dbca923eed2642efd5e7f5bf79e2a Mon Sep 17 00:00:00 2001 From: coldWater Date: Thu, 19 Sep 2024 21:54:44 +0800 Subject: [PATCH 18/40] SpillerConfig Signed-off-by: coldWater --- .../src/pipelines/builders/builder_sort.rs | 20 +++++++-------- .../transforms/hash_join/hash_join_spiller.rs | 13 +++++----- .../transforms/transform_sort_spill.rs | 14 +++++------ .../transform_window_partition_collect.rs | 17 +++++-------- src/query/service/src/spillers/spiller.rs | 25 +++++++++---------- .../service/tests/it/spillers/spiller.rs | 15 +++++------ 6 files changed, 47 insertions(+), 57 deletions(-) diff --git a/src/query/service/src/pipelines/builders/builder_sort.rs b/src/query/service/src/pipelines/builders/builder_sort.rs index 5d22f8b0b392..de4a79597f0e 100644 --- a/src/query/service/src/pipelines/builders/builder_sort.rs +++ b/src/query/service/src/pipelines/builders/builder_sort.rs @@ -285,19 +285,17 @@ impl SortPipelineBuilder { if may_spill { let schema = add_order_field(sort_merge_output_schema.clone(), &self.sort_desc); - let config = SpillerConfig::create(query_spill_prefix( - self.ctx.get_tenant().tenant_name(), - &self.ctx.get_id(), - )); + let config = SpillerConfig { + location_prefix: query_spill_prefix( + self.ctx.get_tenant().tenant_name(), + &self.ctx.get_id(), + ), + disk_spill: None, + spiller_type: SpillerType::OrderBy, + }; pipeline.add_transform(|input, output| { let op = DataOperator::instance().operator(); - let spiller = Spiller::create( - self.ctx.clone(), - op, - config.clone(), - None, - SpillerType::OrderBy, - )?; + let spiller = Spiller::create(self.ctx.clone(), op, config.clone())?; Ok(ProcessorPtr::create(create_transform_sort_spill( input, output, diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs index c858ffb98dc9..876542882644 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_spiller.rs @@ -62,17 +62,18 @@ impl HashJoinSpiller { is_build_side: bool, ) -> Result { // Create a Spiller for spilling build side data. 
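Patch 18 moves the per-operator knobs into `SpillerConfig` itself, so call sites like the one below assemble a single value and `Spiller::create` goes back to three arguments. A trimmed-down mirror of the shape, with stand-in types rather than the crate's definitions:

#[allow(dead_code)]
#[derive(Clone, Copy)]
enum SpillerType {
    OrderBy,
    HashJoinBuild,
    HashJoinProbe,
    Window,
}

#[allow(dead_code)]
struct SpillerConfig {
    location_prefix: String,
    disk_spill: Option<u64>, // stands in for the optional disk budget handle
    spiller_type: SpillerType,
}

fn main() {
    // Build side of a hash join: remote-only spilling, no local disk budget.
    let _cfg = SpillerConfig {
        location_prefix: "_query_spill/tenant1/q1".to_string(),
        disk_spill: None,
        spiller_type: SpillerType::HashJoinBuild,
    };
}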
- let spill_config = SpillerConfig::create(query_spill_prefix( - ctx.get_tenant().tenant_name(), - &ctx.get_id(), - )); - let operator = DataOperator::instance().operator(); let spiller_type = if is_build_side { SpillerType::HashJoinBuild } else { SpillerType::HashJoinProbe }; - let spiller = Spiller::create(ctx.clone(), operator, spill_config, None, spiller_type)?; + let spill_config = SpillerConfig { + location_prefix: query_spill_prefix(ctx.get_tenant().tenant_name(), &ctx.get_id()), + disk_spill: None, + spiller_type, + }; + let operator = DataOperator::instance().operator(); + let spiller = Spiller::create(ctx.clone(), operator, spill_config)?; let num_partitions = (1 << spill_partition_bits) as usize; // The memory threshold of each partition, we will spill the partition data diff --git a/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs b/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs index 37823566665e..f0c6bd97d556 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_sort_spill.rs @@ -486,13 +486,13 @@ mod tests { limit: Option, ) -> Result>> { let op = DataOperator::instance().operator(); - let spiller = Spiller::create( - ctx.clone(), - op, - SpillerConfig::create("_spill_test".to_string()), - None, - SpillerType::OrderBy, - )?; + let spill_config = SpillerConfig { + location_prefix: "_spill_test".to_string(), + disk_spill: None, + spiller_type: SpillerType::OrderBy, + }; + + let spiller = Spiller::create(ctx.clone(), op, spill_config)?; let sort_desc = Arc::new(vec![SortColumnDescription { offset: 0, diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs index dbf47d38110f..c9c8d9ea0117 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -120,20 +120,15 @@ impl TransformWindowPartitionCollect { partition_id[*partition] = new_partition_id; } - let spill_config = SpillerConfig::create(query_spill_prefix( - ctx.get_tenant().tenant_name(), - &ctx.get_id(), - )); + let spill_config = SpillerConfig { + location_prefix: query_spill_prefix(ctx.get_tenant().tenant_name(), &ctx.get_id()), + disk_spill, + spiller_type: SpillerType::Window, + }; // Create an inner `Spiller` to spill data. let operator = DataOperator::instance().operator(); - let spiller = Spiller::create( - ctx.clone(), - operator, - spill_config, - disk_spill, - SpillerType::Window, - )?; + let spiller = Spiller::create(ctx.clone(), operator, spill_config)?; // Create the window partition buffer. 
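+        // It buffers rows per partition and flushes them through the spiller
+        // above once the configured spill thresholds are hit.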
let buffer = diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index 994cedd82d60..164a0312ef86 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -68,12 +68,8 @@ impl Display for SpillerType { #[derive(Clone)] pub struct SpillerConfig { pub location_prefix: String, -} - -impl SpillerConfig { - pub fn create(location_prefix: String) -> Self { - Self { location_prefix } - } + pub disk_spill: Option>, + pub spiller_type: SpillerType, } /// Spiller is a unified framework for operators which need to spill data from memory. @@ -86,7 +82,7 @@ impl SpillerConfig { pub struct Spiller { ctx: Arc, operator: Operator, - config: SpillerConfig, + location_prefix: String, disk_spill: Option>, _spiller_type: SpillerType, pub join_spilling_partition_bits: usize, @@ -104,14 +100,17 @@ impl Spiller { ctx: Arc, operator: Operator, config: SpillerConfig, - disk_spill: Option>, - spiller_type: SpillerType, ) -> Result { let join_spilling_partition_bits = ctx.get_settings().get_join_spilling_partition_bits()?; + let SpillerConfig { + location_prefix, + disk_spill, + spiller_type, + } = config; Ok(Self { - ctx: ctx.clone(), + ctx, operator, - config, + location_prefix, disk_spill, _spiller_type: spiller_type, join_spilling_partition_bits, @@ -319,7 +318,7 @@ impl Spiller { let (buf, range) = dma_read_file_range(path, data_range).await?; let data = &buf[range]; record_read_profile(&instant, data.len()); - Ok(deserialize_block(columns_layout, &data)) + Ok(deserialize_block(columns_layout, data)) } } } @@ -341,7 +340,7 @@ impl Spiller { } .unwrap_or(Location::Storage(format!( "{}/{unique_name}", - self.config.location_prefix + self.location_prefix ))); let written = match &location { diff --git a/src/query/service/tests/it/spillers/spiller.rs b/src/query/service/tests/it/spillers/spiller.rs index 3e627bc99caa..b7f2f11ecfec 100644 --- a/src/query/service/tests/it/spillers/spiller.rs +++ b/src/query/service/tests/it/spillers/spiller.rs @@ -38,17 +38,14 @@ async fn test_spill_with_partition() -> Result<()> { let ctx = fixture.new_query_ctx().await?; let tenant = ctx.get_tenant(); - let spiller_config = - SpillerConfig::create(query_spill_prefix(tenant.tenant_name(), &ctx.get_id())); + let spiller_config = SpillerConfig { + location_prefix: query_spill_prefix(tenant.tenant_name(), &ctx.get_id()), + disk_spill: None, + spiller_type: SpillerType::HashJoinBuild, + }; let operator = DataOperator::instance().operator(); - let mut spiller = Spiller::create( - ctx, - operator, - spiller_config, - None, - SpillerType::HashJoinBuild, - )?; + let mut spiller = Spiller::create(ctx, operator, spiller_config)?; // Generate data block: two columns, type is i32, 100 rows let data = DataBlock::new_from_columns(vec![ From 20fd4a6740ae83604ffe7f95defc2a24e9cb1f79 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 20 Sep 2024 18:53:45 +0800 Subject: [PATCH 19/40] temp dir Signed-off-by: coldWater --- Cargo.lock | 1 + src/query/config/src/config.rs | 28 +- src/query/config/src/inner.rs | 10 + src/query/service/src/global_services.rs | 5 +- .../src/interpreters/hook/vacuum_hook.rs | 12 +- .../src/pipelines/builders/builder_window.rs | 19 +- .../transform_window_partition_collect.rs | 18 +- src/query/service/src/spillers/mod.rs | 1 - src/query/service/src/spillers/spiller.rs | 87 ++--- src/query/storages/common/cache/Cargo.toml | 1 + src/query/storages/common/cache/src/lib.rs | 5 +- src/query/storages/common/cache/src/temp.rs | 138 
-------- .../storages/common/cache/src/temp_dir.rs | 297 ++++++++++++++++-- 13 files changed, 357 insertions(+), 265 deletions(-) delete mode 100644 src/query/storages/common/cache/src/temp.rs diff --git a/Cargo.lock b/Cargo.lock index a78b75e21b5c..a16c0aa34de3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5402,6 +5402,7 @@ dependencies = [ "log", "parking_lot 0.12.3", "rayon", + "rustix 0.38.37", "siphasher", "tempfile", ] diff --git a/src/query/config/src/config.rs b/src/query/config/src/config.rs index fdaac90bfbc0..13c8a4bde12d 100644 --- a/src/query/config/src/config.rs +++ b/src/query/config/src/config.rs @@ -25,6 +25,7 @@ use clap::Parser; use clap::Subcommand; use clap::ValueEnum; use databend_common_base::base::mask_string; +use databend_common_base::base::OrderedFloat; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_meta_app::principal::UserSettingValue; @@ -2944,6 +2945,14 @@ pub struct SpillConfig { default_value = "./.databend/temp/_query_spill" )] pub spill_local_disk_path: String, + + #[clap(long, value_name = "VALUE", default_value = "60")] + /// Allow space in percentage to spill to local disk. + pub spill_local_disk_max_space_percentage: OrderedFloat, + + #[clap(long, value_name = "VALUE", default_value = "18446744073709551615")] + /// Allow space in bytes to spill to local disk. + pub spill_local_disk_max_bytes: u64, } mod cache_config_converters { @@ -3069,8 +3078,23 @@ mod cache_config_converters { type Error = ErrorCode; fn try_from(value: SpillConfig) -> std::result::Result { + let SpillConfig { + spill_local_disk_path, + spill_local_disk_max_space_percentage, + spill_local_disk_max_bytes, + } = value; + if !spill_local_disk_max_space_percentage.is_normal() + || spill_local_disk_max_space_percentage.is_sign_negative() + || spill_local_disk_max_space_percentage > OrderedFloat(100.0) + { + return Err(ErrorCode::InvalidArgument( + "invalid spill_local_disk_max_space_percentage", + )); + } Ok(Self { - path: value.spill_local_disk_path, + path: spill_local_disk_path, + max_disk_ratio: spill_local_disk_max_space_percentage / 100.0, + global_bytes_limit: spill_local_disk_max_bytes, }) } } @@ -3079,6 +3103,8 @@ mod cache_config_converters { fn from(value: inner::SpillConfig) -> Self { Self { spill_local_disk_path: value.path, + spill_local_disk_max_space_percentage: value.max_disk_ratio * 100.0, + spill_local_disk_max_bytes: value.global_bytes_limit, } } } diff --git a/src/query/config/src/inner.rs b/src/query/config/src/inner.rs index 050347314884..296e2cb1db4d 100644 --- a/src/query/config/src/inner.rs +++ b/src/query/config/src/inner.rs @@ -22,6 +22,7 @@ use std::time::Duration; use databend_common_base::base::mask_string; use databend_common_base::base::GlobalUniqName; +use databend_common_base::base::OrderedFloat; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_grpc::RpcClientConf; @@ -144,6 +145,7 @@ impl Debug for InnerConfig { .field("storage", &self.storage) .field("catalogs", &self.catalogs) .field("cache", &self.cache) + .field("spill", &self.spill) .field("background", &self.background) .finish() } @@ -709,12 +711,20 @@ impl Default for CacheConfig { pub struct SpillConfig { /// Path of spill to local disk. disable if it's empty. pub path: String, + + /// Allow ratio use of disk space. + pub max_disk_ratio: OrderedFloat, + + /// Allow bytes use of disk space. 
+ pub global_bytes_limit: u64, } impl Default for SpillConfig { fn default() -> Self { Self { path: "./.databend/temp/_query_spill".to_string(), + max_disk_ratio: OrderedFloat(0.6), + global_bytes_limit: u64::MAX, } } } diff --git a/src/query/service/src/global_services.rs b/src/query/service/src/global_services.rs index a6194387c3b3..c40a4ebb8601 100644 --- a/src/query/service/src/global_services.rs +++ b/src/query/service/src/global_services.rs @@ -147,10 +147,7 @@ impl GlobalServices { &config.query.max_server_memory_usage, config.query.tenant_id.tenant_name().to_string(), )?; - TempDirManager::init( - &config.spill, - config.query.tenant_id.tenant_name().to_string(), - )?; + TempDirManager::init(&config.spill, config.query.tenant_id.tenant_name())?; if let Some(addr) = config.query.cloud_control_grpc_server_address.clone() { CloudControlApiProvider::init(addr, config.query.cloud_control_grpc_timeout).await?; diff --git a/src/query/service/src/interpreters/hook/vacuum_hook.rs b/src/query/service/src/interpreters/hook/vacuum_hook.rs index 654b85bec3bd..9360e8ed8086 100644 --- a/src/query/service/src/interpreters/hook/vacuum_hook.rs +++ b/src/query/service/src/interpreters/hook/vacuum_hook.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::fs::remove_dir_all; -use std::io::ErrorKind; use std::sync::Arc; use std::time::Duration; @@ -72,13 +70,5 @@ pub fn hook_vacuum_temp_files(query_ctx: &Arc) -> Result<()> { } pub fn hook_disk_temp_dir(query_ctx: &Arc) -> Result<()> { - if let Some(cfg) = TempDirManager::instance().get_disk_spill_config() { - let root = cfg.path.join(query_ctx.get_id()); - if let Err(e) = remove_dir_all(root) { - if !matches!(e.kind(), ErrorKind::NotFound) { - return Err(e.into()); - } - } - } - Ok(()) + TempDirManager::instance().drop_disk_spill_dir(&query_ctx.get_id()) } diff --git a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index 4b7e579e9b05..0a191e79292c 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -32,7 +32,6 @@ use crate::pipelines::processors::transforms::WindowFunctionInfo; use crate::pipelines::processors::transforms::WindowSpillSettings; use crate::pipelines::processors::TransformWindow; use crate::pipelines::PipelineBuilder; -use crate::spillers::DiskSpill; impl PipelineBuilder { pub(crate) fn build_window(&mut self, window: &Window) -> Result<()> { @@ -175,10 +174,6 @@ impl PipelineBuilder { } self.main_pipeline.reorder_inputs(rule); - let max_block_size = settings.get_max_block_size()? as usize; - let sort_block_size = settings.get_window_partition_sort_block_size()? 
as usize; - let sort_spilling_batch_bytes = settings.get_sort_spilling_batch_bytes()?; - let enable_loser_tree = settings.get_enable_loser_tree_merge_sort()?; let have_order_col = window_partition.after_exchange.unwrap_or(false); let sort_desc = window_partition .order_by @@ -194,12 +189,9 @@ impl PipelineBuilder { }) .collect::>>()?; let disk_bytes_limit = settings.get_window_partition_spilling_to_disk_bytes_limit()?; - let disk_spill = TempDirManager::instance() - .get_disk_spill_config() - .map(|cfg| { - let root = cfg.path.join(self.ctx.get_id()); - DiskSpill::new(root, disk_bytes_limit as isize) - }); + let disk_spill = + TempDirManager::instance().get_disk_spill_dir(disk_bytes_limit, &self.ctx.get_id()); + let window_spill_settings = WindowSpillSettings::new(&settings, num_processors)?; // 3. Build window partition collect processors. @@ -207,6 +199,7 @@ impl PipelineBuilder { for processor_id in 0..num_processors { let processor = TransformWindowPartitionCollect::new( self.ctx.clone(), + &settings, processor_id, num_processors, num_partitions, @@ -214,10 +207,6 @@ impl PipelineBuilder { disk_spill.clone(), sort_desc.clone(), plan_schema.clone(), - max_block_size, - sort_block_size, - sort_spilling_batch_bytes, - enable_loser_tree, have_order_col, )?; pipe_items.push(processor.into_pipe_item()); diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs index c9c8d9ea0117..3643cb52de2d 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -33,14 +33,15 @@ use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_core::query_spill_prefix; use databend_common_pipeline_core::PipeItem; use databend_common_pipeline_transforms::processors::sort_merge; +use databend_common_settings::Settings; use databend_common_storage::DataOperator; use databend_common_storages_fuse::TableContext; +use databend_storages_common_cache::TempDir; use super::WindowPartitionBuffer; use super::WindowPartitionMeta; use super::WindowSpillSettings; use crate::sessions::QueryContext; -use crate::spillers::DiskSpill; use crate::spillers::Spiller; use crate::spillers::SpillerConfig; use crate::spillers::SpillerType; @@ -90,20 +91,16 @@ pub struct TransformWindowPartitionCollect { } impl TransformWindowPartitionCollect { - #[allow(clippy::too_many_arguments)] pub fn new( ctx: Arc, + settings: &Settings, processor_id: usize, num_processors: usize, num_partitions: usize, spill_settings: WindowSpillSettings, - disk_spill: Option>, + disk_spill: Option>, sort_desc: Vec, schema: DataSchemaRef, - max_block_size: usize, - sort_block_size: usize, - sort_spilling_batch_bytes: usize, - enable_loser_tree: bool, have_order_col: bool, ) -> Result { let inputs = (0..num_processors).map(|_| InputPort::create()).collect(); @@ -128,12 +125,17 @@ impl TransformWindowPartitionCollect { // Create an inner `Spiller` to spill data. let operator = DataOperator::instance().operator(); - let spiller = Spiller::create(ctx.clone(), operator, spill_config)?; + let spiller = Spiller::create(ctx, operator, spill_config)?; // Create the window partition buffer. + let sort_block_size = settings.get_window_partition_sort_block_size()? 
as usize; let buffer = WindowPartitionBuffer::new(spiller, num_partitions, sort_block_size, spill_settings)?; + let max_block_size = settings.get_max_block_size()? as usize; + let enable_loser_tree = settings.get_enable_loser_tree_merge_sort()?; + let sort_spilling_batch_bytes = settings.get_sort_spilling_batch_bytes()?; + Ok(Self { inputs, output, diff --git a/src/query/service/src/spillers/mod.rs b/src/query/service/src/spillers/mod.rs index e27e973fad4f..31d420849041 100644 --- a/src/query/service/src/spillers/mod.rs +++ b/src/query/service/src/spillers/mod.rs @@ -17,7 +17,6 @@ mod spiller; pub use partition_buffer::PartitionBuffer; pub use partition_buffer::PartitionBufferFetchOption; -pub use spiller::DiskSpill; pub use spiller::Location; pub use spiller::SpilledData; pub use spiller::Spiller; diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index 164a0312ef86..4b603541ca53 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -16,14 +16,9 @@ use std::collections::HashMap; use std::collections::HashSet; use std::fmt::Display; use std::fmt::Formatter; -use std::fs::create_dir; use std::io; use std::ops::Range; -use std::path::Path; -use std::path::PathBuf; use std::sync::Arc; -use std::sync::Mutex; -use std::sync::Once; use std::time::Instant; use databend_common_base::base::dma_read_file; @@ -38,6 +33,8 @@ use databend_common_exception::Result; use databend_common_expression::arrow::deserialize_column; use databend_common_expression::arrow::serialize_column; use databend_common_expression::DataBlock; +use databend_storages_common_cache::TempDir; +use databend_storages_common_cache::TempPath; use opendal::Operator; use crate::sessions::QueryContext; @@ -68,7 +65,7 @@ impl Display for SpillerType { #[derive(Clone)] pub struct SpillerConfig { pub location_prefix: String, - pub disk_spill: Option>, + pub disk_spill: Option>, pub spiller_type: SpillerType, } @@ -83,7 +80,7 @@ pub struct Spiller { ctx: Arc, operator: Operator, location_prefix: String, - disk_spill: Option>, + disk_spill: Option>, _spiller_type: SpillerType, pub join_spilling_partition_bits: usize, /// 1 partition -> N partition files @@ -217,7 +214,8 @@ impl Spiller { let data = match location { Location::Storage(loc) => self.operator.read(loc).await?.to_bytes(), Location::Disk(path) => { - let cap = columns_layout.iter().sum(); + let cap = path.size(); + debug_assert_eq!(cap, columns_layout.iter().sum::()); let mut data = Vec::with_capacity(cap); dma_read_file(path, &mut data).await?; data.into() @@ -264,11 +262,15 @@ impl Spiller { let data = match location { Location::Storage(loc) => self.operator.read(loc).await?.to_bytes(), Location::Disk(path) => { - let cap = if let Some((_, range, _)) = partitions.last() { - range.end - } else { - 0 - }; + let cap = path.size(); + debug_assert_eq!( + cap, + if let Some((_, range, _)) = partitions.last() { + range.end + } else { + 0 + } + ); let mut data = Vec::with_capacity(cap); dma_read_file(path, &mut data).await?; @@ -324,23 +326,14 @@ impl Spiller { } async fn write_encodes(&mut self, size: usize, blocks: Vec) -> Result { - let unique_name = GlobalUniqName::unique(); let location = match &self.disk_spill { None => None, - Some(disk) => { - if disk.can_write(size as isize) { - disk.init()?; - Some(Location::Disk( - disk.root.join(unique_name.clone()).into_boxed_path(), - )) - } else { - None - } - } + Some(disk) => disk.new_file_with_size(size)?.map(Location::Disk), } 
.unwrap_or(Location::Storage(format!( - "{}/{unique_name}", - self.location_prefix + "{}/{}", + self.location_prefix, + GlobalUniqName::unique(), ))); let written = match &location { @@ -367,7 +360,7 @@ impl Spiller { .map(|data| io::IoSlice::new(data)) .collect::>(); - dma_write_file_vectored(path, &bufs).await? + dma_write_file_vectored(path.as_ref(), &bufs).await? } }; debug_assert_eq!(size, written); @@ -390,7 +383,7 @@ pub enum SpilledData { #[derive(Debug, Clone, Hash, PartialEq, Eq)] pub enum Location { Storage(String), - Disk(Box), + Disk(TempPath), } pub struct EncodedBlock(pub Vec>); @@ -432,44 +425,6 @@ pub fn deserialize_block(columns_layout: &[usize], mut data: &[u8]) -> DataBlock DataBlock::new_from_columns(columns) } -pub struct DiskSpill { - pub root: PathBuf, - pub bytes_limit: Mutex, - inited: Once, -} - -impl DiskSpill { - pub fn new(root: PathBuf, limit: isize) -> Arc { - Arc::new(DiskSpill { - root, - bytes_limit: Mutex::new(limit), - inited: Once::new(), - }) - } - - pub fn can_write(&self, size: isize) -> bool { - let mut guard = self.bytes_limit.lock().unwrap(); - if *guard > size { - *guard -= size; - true - } else { - false - } - } - - pub fn init(&self) -> Result<()> { - let mut rt = Ok(()); - self.inited.call_once(|| { - if let Err(e) = create_dir(&self.root) { - if !matches!(e.kind(), io::ErrorKind::AlreadyExists) { - rt = Err(e); - } - } - }); - Ok(rt?) - } -} - pub fn record_write_profile(start: &Instant, write_bytes: usize) { Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); diff --git a/src/query/storages/common/cache/Cargo.toml b/src/query/storages/common/cache/Cargo.toml index d8c5b0bc4b08..8586b589ae10 100644 --- a/src/query/storages/common/cache/Cargo.toml +++ b/src/query/storages/common/cache/Cargo.toml @@ -32,6 +32,7 @@ log = { workspace = true } parking_lot = { workspace = true } rayon = "1.9.0" siphasher = "0.3.10" +rustix = "0.38.37" [dev-dependencies] tempfile = "3.4.0" diff --git a/src/query/storages/common/cache/src/lib.rs b/src/query/storages/common/cache/src/lib.rs index 114eb3513f4a..8c70b627aeac 100644 --- a/src/query/storages/common/cache/src/lib.rs +++ b/src/query/storages/common/cache/src/lib.rs @@ -14,13 +14,13 @@ #![feature(write_all_vectored)] #![feature(associated_type_defaults)] +#![feature(assert_matches)] mod cache; mod caches; mod manager; mod providers; mod read; -mod temp; mod temp_dir; pub use cache::CacheAccessor; @@ -47,5 +47,4 @@ pub use read::InMemoryCacheReader; pub use read::InMemoryItemCacheReader; pub use read::LoadParams; pub use read::Loader; -pub use temp_dir::TempDir; -pub use temp_dir::TempDirManager; +pub use temp_dir::*; diff --git a/src/query/storages/common/cache/src/temp.rs b/src/query/storages/common/cache/src/temp.rs deleted file mode 100644 index e34b70a34bf0..000000000000 --- a/src/query/storages/common/cache/src/temp.rs +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::ops::Drop; -use std::path::Path; -use std::sync::atomic::AtomicUsize; -use std::sync::atomic::Ordering; -use std::sync::Arc; -use std::sync::Mutex; - -use databend_common_base::base::GlobalUniqName; - -pub struct TempFileManager { - root: Box, - total_limit: usize, - dir_limit: usize, - _reserved: usize, - - group: Mutex, -} - -struct Group { - dirs: HashMap, Arc>, -} - -impl Group { - fn size(&self) -> usize { - self.dirs.values().map(|v| *v.size.lock().unwrap()).sum() - } -} - -#[derive(Debug, Default)] -struct DirInfo { - count: AtomicUsize, - size: Mutex, -} - -impl TempFileManager { - pub fn get_dir(self: &Arc, id: &str) -> TempDir { - let path = self.root.join(id).into_boxed_path(); - - let mut group = self.group.lock().unwrap(); - - match group.dirs.entry(path.clone()) { - Entry::Occupied(o) => TempDir { - path, - dir_info: o.get().clone(), - manager: self.clone(), - }, - Entry::Vacant(v) => { - let dir_info = Arc::new(DirInfo::default()); - v.insert(dir_info.clone()); - TempDir { - path, - dir_info, - manager: self.clone(), - } - } - } - } - - // pub fn used(&self,size :usize) -> Result { - // let stat = rustix::fs::statvfs(self.root)?; - // stat.f_bavail > self.reserved + (size +stat.f_frsize-1)/stat.f_frsize - // } -} - -pub struct TempDir { - path: Box, - dir_info: Arc, - manager: Arc, -} - -impl TempDir { - pub fn new_file_with_size(&self, size: usize) -> Option { - let path = self.path.join(GlobalUniqName::unique()).into_boxed_path(); - - let dir_info = self.dir_info.clone(); - - let group = self.manager.group.lock().unwrap(); - let mut dir_size = dir_info.size.lock().unwrap(); - if self.manager.dir_limit < *dir_size + size - || self.manager.total_limit < group.size() + size - { - return None; - } - - *dir_size += size; - drop(dir_size); - - dir_info.count.fetch_add(1, Ordering::SeqCst); - Some(TempFile { - path, - size, - dir_info, - }) - } -} - -#[derive(Debug)] -pub struct TempFile { - path: Box, - size: usize, - dir_info: Arc, -} - -impl Drop for TempFile { - fn drop(&mut self) { - self.dir_info.count.fetch_sub(1, Ordering::SeqCst); - - let mut guard = self.dir_info.size.lock().unwrap(); - *guard -= self.size; - - let _ = std::fs::remove_file(&self.path); - } -} - -#[cfg(test)] -mod tests { - - #[test] - fn test_xxx() { - println!("aa") - } -} diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs index 23c2b0649199..52dd8cd7108e 100644 --- a/src/query/storages/common/cache/src/temp_dir.rs +++ b/src/query/storages/common/cache/src/temp_dir.rs @@ -12,33 +12,73 @@ // See the License for the specific language governing permissions and // limitations under the License. 
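+//! Per-query temporary directories for disk spill, with budget accounting.
+//!
+//! Before handing out a new file, the manager checks three budgets: the
+//! per-directory `limit`, the process-wide `global_limit`, and a statvfs
+//! probe that keeps `reserved` blocks of the filesystem free.
+//!
+//! A sketch of the intended flow (all names are defined below; `limit`,
+//! `query_id` and `n` are placeholders):
+//!
+//! ```ignore
+//! let dir = TempDirManager::instance()
+//!     .get_disk_spill_dir(limit, query_id)
+//!     .unwrap();
+//! if let Some(path) = dir.new_file_with_size(n)? {
+//!     // write up to `n` bytes to `path`; dropping the last clone of
+//!     // `path` removes the file and releases its budget
+//! }
+//! ```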
+use std::collections::hash_map::Entry;
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::fs::create_dir;
 use std::fs::create_dir_all;
 use std::fs::remove_dir_all;
+use std::hash::Hash;
 use std::io::ErrorKind;
+use std::ops::Deref;
+use std::ops::Drop;
+use std::path::Path;
 use std::path::PathBuf;
+use std::sync::atomic::AtomicUsize;
+use std::sync::atomic::Ordering;
 use std::sync::Arc;
+use std::sync::Mutex;
+use std::sync::Once;
 
 use databend_common_base::base::GlobalInstance;
+use databend_common_base::base::GlobalUniqName;
 use databend_common_config::SpillConfig;
+use databend_common_exception::ErrorCode;
 use databend_common_exception::Result;
+use rustix::fs::statvfs;
 
 pub struct TempDirManager {
-    disk_spill_config: Option<TempDir>,
+    root: Option<Box<Path>>,
+
+    // Global limit in bytes
+    global_limit: usize,
+    // Reserved disk space in blocks
+    reserved: u64,
+
+    group: Mutex<Group>,
 }
 
 impl TempDirManager {
-    pub fn init(config: &SpillConfig, tenant_id: String) -> Result<()> {
-        let disk_spill_config = if config.path.is_empty() {
-            None
+    pub fn init(config: &SpillConfig, tenant_id: &str) -> Result<()> {
+        let (root, reserved) = if config.path.is_empty() {
+            (None, 0)
         } else {
-            let path = PathBuf::from(&config.path).join(tenant_id.clone());
+            let path = PathBuf::from(&config.path)
+                .join(tenant_id)
+                .into_boxed_path();
+
+            if let Err(e) = remove_dir_all(&path) {
+                if !matches!(e.kind(), ErrorKind::NotFound) {
+                    Err(e)?;
+                }
+            }
+
+            create_dir_all(&path)?;
+
+            let stat = statvfs(path.as_ref()).map_err(|e| ErrorCode::Internal(e.to_string()))?;
+            let reserved = (stat.f_blocks as f64 * (1.0 - *config.max_disk_ratio)) as u64;
 
-            let temp_dir = TempDir { path };
-            temp_dir.init()?;
-            Some(temp_dir)
+            (Some(path), reserved)
         };
 
-        GlobalInstance::set(Arc::new(Self { disk_spill_config }));
+        GlobalInstance::set(Arc::new(Self {
+            root,
+            global_limit: config.global_bytes_limit as usize,
+            reserved,
+            group: Mutex::new(Group {
+                dirs: HashMap::new(),
+            }),
+        }));
 
         Ok(())
     }
 
@@ -46,24 +86,245 @@ impl TempDirManager {
         GlobalInstance::get()
     }
 
-    pub fn get_disk_spill_config(&self) -> Option<TempDir> {
-        self.disk_spill_config.clone()
+    pub fn get_disk_spill_dir(
+        self: &Arc<Self>,
+        limit: usize,
+        query_id: &str,
+    ) -> Option<Arc<TempDir>> {
+        self.root.as_ref()?;
+
+        let path = self.root.as_ref().unwrap().join(query_id).into_boxed_path();
+        let mut group = self.group.lock().unwrap();
+        let dir = match group.dirs.entry(path.clone()) {
+            Entry::Occupied(o) => TempDir {
+                path,
+                dir_info: o.get().clone(),
+                manager: self.clone(),
+            },
+            Entry::Vacant(v) => {
+                let dir_info = Arc::new(DirInfo {
+                    limit,
+                    count: Default::default(),
+                    size: Default::default(),
+                    inited: Once::new(),
+                });
+                v.insert(dir_info.clone());
+                TempDir {
+                    path,
+                    dir_info,
+                    manager: self.clone(),
+                }
+            }
+        };
+        Some(Arc::new(dir))
+    }
+
+    pub fn drop_disk_spill_dir(self: &Arc<Self>, query_id: &str) -> Result<()> {
+        let path = self.root.as_ref().unwrap().join(query_id).into_boxed_path();
+        let mut group = self.group.lock().unwrap();
+        if group.dirs.remove(&path).is_some() {
+            if let Err(e) = remove_dir_all(&path) {
+                if !matches!(e.kind(), ErrorKind::NotFound) {
+                    Err(e)?;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    fn insufficient_disk(&self, size: u64) -> Result<bool> {
+        let stat = statvfs(self.root.as_ref().unwrap().as_ref())
+            .map_err(|e| ErrorCode::Internal(e.to_string()))?;
+        Ok(stat.f_bavail < self.reserved + (size + stat.f_frsize - 1) / stat.f_frsize)
+    }
+}
+
+struct Group {
+    dirs: HashMap<Box<Path>, Arc<DirInfo>>,
+}
+
+impl Group {
+    fn size(&self) -> usize {
+        self.dirs.values().map(|v| *v.size.lock().unwrap()).sum()
     }
 }
 
 #[derive(Clone)]
 pub struct TempDir {
-    pub path: PathBuf,
+    path: Box<Path>,
+    dir_info: Arc<DirInfo>,
+    manager: Arc<TempDirManager>,
 }
 
 impl TempDir {
-    fn init(&self) -> Result<()> {
-        if let Err(e) = remove_dir_all(&self.path) {
-            if !matches!(e.kind(), ErrorKind::NotFound) {
-                Err(e)?;
-            }
+    pub fn new_file_with_size(&self, size: usize) -> Result<Option<TempPath>> {
+        let path = self.path.join(GlobalUniqName::unique()).into_boxed_path();
+
+        if self.dir_info.limit < *self.dir_info.size.lock().unwrap() + size
+            || self.manager.global_limit < self.manager.group.lock().unwrap().size() + size
+            || self.manager.insufficient_disk(size as u64)?
+        {
+            return Ok(None);
+        }
+
+        let mut dir_size = self.dir_info.size.lock().unwrap();
+        if self.dir_info.limit < *dir_size + size {
+            return Ok(None);
         }
-        Ok(create_dir_all(&self.path)?)
+        *dir_size += size;
+        drop(dir_size);
+
+        self.init_dir()?;
+
+        let dir_info = self.dir_info.clone();
+        dir_info.count.fetch_add(1, Ordering::SeqCst);
+
+        Ok(Some(TempPath(Arc::new(InnerPath {
+            path,
+            size,
+            dir_info,
+        }))))
+    }
+
+    fn init_dir(&self) -> Result<()> {
+        let mut rt = Ok(());
+        self.dir_info.inited.call_once(|| {
+            if let Err(e) = create_dir(&self.path) {
+                if !matches!(e.kind(), ErrorKind::AlreadyExists) {
+                    rt = Err(e);
+                }
+            }
+        });
+        Ok(rt?)
+    }
+}
+
+struct DirInfo {
+    limit: usize,
+    count: AtomicUsize,
+    size: Mutex<usize>,
+    inited: Once,
+}
+
+impl Debug for DirInfo {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("DirInfo")
+            .field("limit", &self.limit)
+            .field("count", &self.count)
+            .field("size", &self.size)
+            .field("inited", &self.inited.is_completed())
+            .finish()
+    }
+}
+
+#[derive(Clone)]
+pub struct TempPath(Arc<InnerPath>);
+
+impl Debug for TempPath {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("TempPath")
+            .field("path", &self.0.path)
+            .field("size", &self.0.size)
+            .field("dir_info", &self.0.dir_info)
+            .finish()
+    }
+}
+
+impl Hash for TempPath {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.0.path.hash(state);
+    }
+}
+
+impl PartialEq for TempPath {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.path == other.0.path
+    }
+}
+
+impl Eq for TempPath {}
+
+impl AsRef<Path> for TempPath {
+    fn as_ref(&self) -> &Path {
+        self.0.path.as_ref()
+    }
+}
+
+impl Deref for TempPath {
+    type Target = Path;
+
+    fn deref(&self) -> &Path {
+        self.as_ref()
+    }
+}
+
+impl TempPath {
+    pub fn size(&self) -> usize {
+        self.0.size
+    }
+}
+
+struct InnerPath {
+    path: Box<Path>,
+    size: usize,
+    dir_info: Arc<DirInfo>,
+}
+
+impl Drop for InnerPath {
+    fn drop(&mut self) {
+        let _ = std::fs::remove_file(&self.path);
+
+        self.dir_info.count.fetch_sub(1, Ordering::SeqCst);
+        let mut guard = self.dir_info.size.lock().unwrap();
+        *guard -= self.size;
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::assert_matches::assert_matches;
+    use std::fs;
+    use std::sync::atomic::Ordering;
+
+    use super::*;
+
+    #[test]
+    fn test_temp_dir() -> Result<()> {
+        let thread = std::thread::current();
+        GlobalInstance::init_testing(thread.name().unwrap());
+
+        let config = SpillConfig {
+            path: "test_data".to_string(),
+            max_disk_ratio: 0.99.into(),
+            global_bytes_limit: 1 << 30,
+        };
+
+        TempDirManager::init(&config, "test_tenant")?;
+
+        let mgr = TempDirManager::instance();
+        let dir = mgr.get_disk_spill_dir(1 << 30, "some_query").unwrap();
+        let path = dir.new_file_with_size(100)?.unwrap();
+
+        println!("{:?}", &path);
+
+        fs::write(&path, vec![b'a'; 100])?;
+
+        assert_eq!(1, dir.dir_info.count.load(Ordering::Relaxed));
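+
+        // The live `TempPath` above holds one refcount and its 100 bytes stay
+        // accounted against the directory until it is dropped.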
+        assert_eq!(100, *dir.dir_info.size.lock().unwrap());
+
+        let path_str = path.as_ref().to_str().unwrap().to_string();
+        drop(path);
+
+        assert_eq!(0, dir.dir_info.count.load(Ordering::Relaxed));
+        assert_eq!(0, *dir.dir_info.size.lock().unwrap());
+
+        assert_matches!(fs::read_to_string(path_str), Err(_));
+
+        mgr.drop_disk_spill_dir("some_query")?;
+
+        remove_dir_all("test_data")?;
+
+        Ok(())
     }
 }

From 39809e1f2dab7cc422115270d5f29133a2c3fffe Mon Sep 17 00:00:00 2001
From: coldWater
Date: Fri, 20 Sep 2024 20:59:02 +0800
Subject: [PATCH 20/40] fix

Signed-off-by: coldWater
---
 src/query/storages/common/cache/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/query/storages/common/cache/Cargo.toml b/src/query/storages/common/cache/Cargo.toml
index 8586b589ae10..7cb83f2577ee 100644
--- a/src/query/storages/common/cache/Cargo.toml
+++ b/src/query/storages/common/cache/Cargo.toml
@@ -31,8 +31,8 @@ hex = "0.4.3"
 log = { workspace = true }
 parking_lot = { workspace = true }
 rayon = "1.9.0"
-siphasher = "0.3.10"
 rustix = "0.38.37"
+siphasher = "0.3.10"
 
 [dev-dependencies]
 tempfile = "3.4.0"

From da387b3c7024c7266713c67fff06891901234d28 Mon Sep 17 00:00:00 2001
From: coldWater
Date: Tue, 24 Sep 2024 19:06:27 +0800
Subject: [PATCH 21/40] drop_disk_spill_dir_unknown

Signed-off-by: coldWater
---
 .../storages/common/cache/src/temp_dir.rs | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs
index 52dd8cd7108e..49059475c193 100644
--- a/src/query/storages/common/cache/src/temp_dir.rs
+++ b/src/query/storages/common/cache/src/temp_dir.rs
@@ -132,6 +132,38 @@ impl TempDirManager {
         Ok(())
     }
 
+    pub fn drop_disk_spill_dir_unknown(
+        self: &Arc<Self>,
+        limit: usize,
+    ) -> Result<Vec<Box<Path>>> {
+        match self.root.as_ref() {
+            None => Ok(vec![]),
+            Some(root) => {
+                let read_dir = std::fs::read_dir(root)?;
+                let group = self.group.lock().unwrap();
+                let to_delete = read_dir
+                    .filter_map(|entry| match entry {
+                        Err(_) => None,
+                        Ok(entry) => {
+                            let path = entry.path().into_boxed_path();
+                            if !group.dirs.contains_key(&path) {
+                                Some(path)
+                            } else {
+                                None
+                            }
+                        }
+                    })
+                    .take(limit)
+                    .collect::<Vec<_>>();
+                drop(group);
+                for path in &to_delete {
+                    remove_dir_all(path)?;
+                }
+                Ok(to_delete)
+            }
+        }
+    }
+
     fn insufficient_disk(&self, size: u64) -> Result<bool> {
         let stat = statvfs(self.root.as_ref().unwrap().as_ref())
             .map_err(|e| ErrorCode::Internal(e.to_string()))?;
@@ -327,4 +359,40 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_drop_disk_spill_dir_unknown() -> Result<()> {
+        let thread = std::thread::current();
+        GlobalInstance::init_testing(thread.name().unwrap());
+
+        let config = SpillConfig {
+            path: "test_data2".to_string(),
+            max_disk_ratio: 0.99.into(),
+            global_bytes_limit: 1 << 30,
+        };
+
+        TempDirManager::init(&config, "test_tenant")?;
+
+        let mgr = TempDirManager::instance();
+        mgr.get_disk_spill_dir(1 << 30, "some_query").unwrap();
+
+        create_dir("test_data2/test_tenant/unknown_query1")?;
+        create_dir("test_data2/test_tenant/unknown_query2")?;
+
+        let mut deleted = mgr.drop_disk_spill_dir_unknown(10)?;
+
+        deleted.sort();
+
+        assert_eq!(
+            vec![
+                PathBuf::from("test_data2/test_tenant/unknown_query1").into_boxed_path(),
+                PathBuf::from("test_data2/test_tenant/unknown_query2").into_boxed_path(),
+            ],
+            deleted
+        );
+
+        remove_dir_all("test_data2")?;
+
+        Ok(())
+    }
 }

From 4344eea590ec5fb5e83e1906c233855ccc6746a2 Mon Sep 17 00:00:00 2001
From: coldWater
Date: Tue, 24 Sep 2024 20:03:11 +0800
Subject: [PATCH 22/40] drop residual temp dir Signed-off-by: coldWater --- .../src/interpreters/hook/vacuum_hook.rs | 13 +++++++++- .../storages/common/cache/src/temp_dir.rs | 24 +++++++++++-------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/query/service/src/interpreters/hook/vacuum_hook.rs b/src/query/service/src/interpreters/hook/vacuum_hook.rs index 427ae5cad48d..94702eb34dd8 100644 --- a/src/query/service/src/interpreters/hook/vacuum_hook.rs +++ b/src/query/service/src/interpreters/hook/vacuum_hook.rs @@ -24,7 +24,9 @@ use databend_common_pipeline_core::query_spill_prefix; use databend_common_storage::DataOperator; use databend_enterprise_vacuum_handler::get_vacuum_handler; use databend_storages_common_cache::TempDirManager; +use log::warn; use opendal::Buffer; +use rand::Rng; use crate::sessions::QueryContext; @@ -68,5 +70,14 @@ pub fn hook_vacuum_temp_files(query_ctx: &Arc) -> Result<()> { } pub fn hook_disk_temp_dir(query_ctx: &Arc) -> Result<()> { - TempDirManager::instance().drop_disk_spill_dir(&query_ctx.get_id()) + let mgr = TempDirManager::instance(); + + if mgr.drop_disk_spill_dir(&query_ctx.get_id())? && rand::thread_rng().gen_ratio(1, 10) { + let deleted = mgr.drop_disk_spill_dir_unknown(5)?; + if !deleted.is_empty() { + warn!("Deleted residual temporary directories: {:?}", deleted) + } + } + + Ok(()) } diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs index 49059475c193..faa05be05cac 100644 --- a/src/query/storages/common/cache/src/temp_dir.rs +++ b/src/query/storages/common/cache/src/temp_dir.rs @@ -119,17 +119,21 @@ impl TempDirManager { Some(Arc::new(dir)) } - pub fn drop_disk_spill_dir(self: &Arc, query_id: &str) -> Result<()> { - let path = self.root.as_ref().unwrap().join(query_id).into_boxed_path(); + pub fn drop_disk_spill_dir(self: &Arc, query_id: &str) -> Result { + let path = match self.root.as_ref() { + None => return Ok(false), + Some(root) => root.join(query_id).into_boxed_path(), + }; + let mut group = self.group.lock().unwrap(); if group.dirs.remove(&path).is_some() { - if let Err(e) = remove_dir_all(&path) { - if !matches!(e.kind(), ErrorKind::NotFound) { - Err(e)?; - } + match remove_dir_all(&path) { + Ok(_) => return Ok(true), + Err(e) if matches!(e.kind(), ErrorKind::NotFound) => {} + res => res?, } } - Ok(()) + Ok(false) } pub fn drop_disk_spill_dir_unknown( @@ -146,10 +150,10 @@ impl TempDirManager { Err(_) => None, Ok(entry) => { let path = entry.path().into_boxed_path(); - if !group.dirs.contains_key(&path) { - Some(path) - } else { + if group.dirs.contains_key(&path) { None + } else { + Some(path) } } }) From 2cbf13fdf063eb880265e997134f8a290c4dcd1e Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 27 Sep 2024 12:07:13 +0800 Subject: [PATCH 23/40] no copy read Signed-off-by: coldWater --- src/common/base/src/base/dma.rs | 78 +++++++++++++++---- src/query/service/src/spillers/spiller.rs | 63 ++++++++------- .../service/tests/it/spillers/spiller.rs | 2 +- 3 files changed, 99 insertions(+), 44 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index 6668a5c4323b..7c582c6e430b 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -119,6 +119,17 @@ impl Drop for DmaBuffer { } } +impl From for Vec { + fn from(mut val: DmaBuffer) -> Self { + let length = val.len; + let cap = val.cap; + + let v = unsafe { Vec::from_raw_parts(val.as_mut_ptr(), length, cap) }; + std::mem::forget(val); + v + } +} + 
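+// The conversion above moves ownership of the aligned allocation into the
+// `Vec`; `forget` keeps `DmaBuffer::drop` from freeing it a second time. The
+// buffer was allocated with `align`, while `Vec<u8>` deallocates with align 1,
+// so this relies on the allocator tolerating the layout mismatch.
+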
/// A `DmaFile` is similar to a `File`, but it is opened with the `O_DIRECT` file in order to /// perform direct IO. struct DmaFile { @@ -186,7 +197,9 @@ impl DmaFile { let buf = self.buffer(); match rustix::io::write(&self.fd, buf) { Ok(n) => { - debug_assert_eq!(n, buf.len()); + if n != buf.len() { + return Err(io::Error::new(io::ErrorKind::Other, "short write")); + } unsafe { self.mut_buffer().set_len(0) }; Ok(n) } @@ -194,13 +207,17 @@ impl DmaFile { } } - fn read_direct(&mut self) -> io::Result { + fn read_direct(&mut self, n: usize) -> io::Result { let Self { fd, buf, .. } = self; let buf = buf.as_mut().unwrap(); - unsafe { buf.set_len(buf.capacity()) }; - match rustix::io::read(fd, buf) { + if n > buf.remaining() { + return Err(io::Error::new(io::ErrorKind::Other, "buf not sufficient")); + } + let start = buf.len(); + unsafe { buf.set_len(buf.len() + n) }; + match rustix::io::read(fd, &mut (*buf)[start..]) { Ok(n) => { - unsafe { buf.set_len(n) }; + unsafe { buf.set_len(start + n) }; Ok(n) } Err(e) => Err(e.into()), @@ -309,8 +326,9 @@ pub async fn dma_read_file( file.set_buffer(buf); let mut n = 0; + let read_n = file.alignment; loop { - file = asyncify(move || file.read_direct().map(|_| file)).await?; + file = asyncify(move || file.read_direct(read_n).map(|_| file)).await?; let buf = file.buffer(); if buf.is_empty() { @@ -342,16 +360,20 @@ pub async fn dma_read_file_range( let buf = DmaBuffer::new(align_end - align_start, file.alignment); file.set_buffer(buf); - let offset = file.seek(SeekFrom::Start(align_start as u64)).await?; - - if offset as usize != align_start { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "range out of range", - )); + if align_start != 0 { + let offset = file.seek(SeekFrom::Start(align_start as u64)).await?; + if offset as usize != align_start { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "range out of range", + )); + } } - file = asyncify(move || file.read_direct().map(|_| file)).await?; + while file.buffer().remaining() > 0 { + let read_n = file.buffer().remaining(); + file = asyncify(move || file.read_direct(read_n).map(|_| file)).await?; + } let rt_range = range.start as usize - align_start..range.end as usize - align_start; Ok((file.buf.unwrap(), rt_range)) @@ -434,4 +456,32 @@ mod tests { let _ = std::fs::remove_file(filename); } + + #[tokio::test] + async fn test_read_direct() { + let filename = "test_file3"; + let _ = std::fs::remove_file(filename); + let stat = rustix::fs::statvfs(".").unwrap(); + let alignment = 512.max(stat.f_bsize as usize); + let file_size: usize = alignment * 2; + + let want = (0..file_size).map(|i| (i % 256) as u8).collect::>(); + + let bufs = vec![IoSlice::new(&want)]; + dma_write_file_vectored(filename, &bufs).await.unwrap(); + + let mut file = DmaFile::open(filename).await.unwrap(); + let buf = DmaBuffer::new(file_size, file.alignment); + file.set_buffer(buf); + + let got = file.read_direct(alignment).unwrap(); + assert_eq!(alignment, got); + assert_eq!(&want[0..alignment], &**file.buffer()); + + let got = file.read_direct(alignment).unwrap(); + assert_eq!(alignment, got); + assert_eq!(&want, &**file.buffer()); + + let _ = std::fs::remove_file(filename); + } } diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index 4b603541ca53..1a32b1654dda 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -21,7 +21,6 @@ use std::ops::Range; use std::sync::Arc; use std::time::Instant; -use 
databend_common_base::base::dma_read_file; use databend_common_base::base::dma_read_file_range; use databend_common_base::base::dma_write_file_vectored; use databend_common_base::base::GlobalUniqName; @@ -211,22 +210,23 @@ impl Spiller { // Read spilled data from storage. let instant = Instant::now(); - let data = match location { - Location::Storage(loc) => self.operator.read(loc).await?.to_bytes(), - Location::Disk(path) => { - let cap = path.size(); - debug_assert_eq!(cap, columns_layout.iter().sum::()); - let mut data = Vec::with_capacity(cap); - dma_read_file(path, &mut data).await?; - data.into() + let block = match location { + Location::Remote(loc) => { + let data = self.operator.read(loc).await?.to_bytes(); + record_read_profile(&instant, data.len()); + deserialize_block(columns_layout, &data) + } + Location::Local(path) => { + let file_size = path.size(); + debug_assert_eq!(file_size, columns_layout.iter().sum::()); + let (buf, range) = dma_read_file_range(path, 0..file_size as u64).await?; + let data = &buf[range]; + record_read_profile(&instant, data.len()); + deserialize_block(columns_layout, data) } }; - // Record statistics. - record_read_profile(&instant, data.len()); - - // Deserialize data block. - Ok(deserialize_block(columns_layout, &data)) + Ok(block) } #[async_backtrace::framed] @@ -260,11 +260,11 @@ impl Spiller { let instant = Instant::now(); let data = match location { - Location::Storage(loc) => self.operator.read(loc).await?.to_bytes(), - Location::Disk(path) => { - let cap = path.size(); + Location::Remote(loc) => self.operator.read(loc).await?.to_bytes(), + Location::Local(path) => { + let file_size = path.size(); debug_assert_eq!( - cap, + file_size, if let Some((_, range, _)) = partitions.last() { range.end } else { @@ -272,9 +272,14 @@ impl Spiller { } ); - let mut data = Vec::with_capacity(cap); - dma_read_file(path, &mut data).await?; - data.into() + let (mut buf, range) = dma_read_file_range(path, 0..file_size as u64).await?; + assert_eq!(range.start, 0); + unsafe { + buf.set_len(range.end); + } + + let buf: Vec = buf.into(); + buf.into() } }; @@ -306,7 +311,7 @@ impl Spiller { let data_range = data_range.start as u64..data_range.end as u64; match location { - Location::Storage(loc) => { + Location::Remote(loc) => { let data = self .operator .read_with(loc) @@ -316,7 +321,7 @@ impl Spiller { record_read_profile(&instant, data.len()); Ok(deserialize_block(columns_layout, &data)) } - Location::Disk(path) => { + Location::Local(path) => { let (buf, range) = dma_read_file_range(path, data_range).await?; let data = &buf[range]; record_read_profile(&instant, data.len()); @@ -328,16 +333,16 @@ impl Spiller { async fn write_encodes(&mut self, size: usize, blocks: Vec) -> Result { let location = match &self.disk_spill { None => None, - Some(disk) => disk.new_file_with_size(size)?.map(Location::Disk), + Some(disk) => disk.new_file_with_size(size)?.map(Location::Local), } - .unwrap_or(Location::Storage(format!( + .unwrap_or(Location::Remote(format!( "{}/{}", self.location_prefix, GlobalUniqName::unique(), ))); let written = match &location { - Location::Storage(loc) => { + Location::Remote(loc) => { let mut writer = self .operator .writer_with(loc) @@ -353,7 +358,7 @@ impl Spiller { writer.close().await?; written } - Location::Disk(path) => { + Location::Local(path) => { let bufs = blocks .iter() .flat_map(|x| &x.0) @@ -382,8 +387,8 @@ pub enum SpilledData { #[derive(Debug, Clone, Hash, PartialEq, Eq)] pub enum Location { - Storage(String), - Disk(TempPath), + 
Remote(String), + Local(TempPath), } pub struct EncodedBlock(pub Vec>); diff --git a/src/query/service/tests/it/spillers/spiller.rs b/src/query/service/tests/it/spillers/spiller.rs index b7f2f11ecfec..ad9779a7d615 100644 --- a/src/query/service/tests/it/spillers/spiller.rs +++ b/src/query/service/tests/it/spillers/spiller.rs @@ -57,7 +57,7 @@ async fn test_spill_with_partition() -> Result<()> { assert!(res.is_ok()); let location = &spiller.partition_location.get(&0).unwrap()[0]; - assert_matches!(location, Location::Storage(_)); + assert_matches!(location, Location::Remote(_)); // Test read spilled data let block = DataBlock::concat(&spiller.read_spilled_partition(&(0)).await?)?; From b469495a7f29fb89fcca7a79b94bbe61e4c19589 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 27 Sep 2024 12:07:46 +0800 Subject: [PATCH 24/40] rename reserved_disk_ratio Signed-off-by: coldWater --- src/query/config/src/config.rs | 12 ++++++------ src/query/config/src/inner.rs | 6 +++--- src/query/storages/common/cache/src/temp_dir.rs | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/query/config/src/config.rs b/src/query/config/src/config.rs index 13c8a4bde12d..c1a9dfab71bb 100644 --- a/src/query/config/src/config.rs +++ b/src/query/config/src/config.rs @@ -2946,9 +2946,9 @@ pub struct SpillConfig { )] pub spill_local_disk_path: String, - #[clap(long, value_name = "VALUE", default_value = "60")] - /// Allow space in percentage to spill to local disk. - pub spill_local_disk_max_space_percentage: OrderedFloat, + #[clap(long, value_name = "VALUE", default_value = "30")] + /// Percentage of reserve disk space that won't be used for spill to local disk. + pub spill_local_disk_reserved_space_percentage: OrderedFloat, #[clap(long, value_name = "VALUE", default_value = "18446744073709551615")] /// Allow space in bytes to spill to local disk. @@ -3080,7 +3080,7 @@ mod cache_config_converters { fn try_from(value: SpillConfig) -> std::result::Result { let SpillConfig { spill_local_disk_path, - spill_local_disk_max_space_percentage, + spill_local_disk_reserved_space_percentage: spill_local_disk_max_space_percentage, spill_local_disk_max_bytes, } = value; if !spill_local_disk_max_space_percentage.is_normal() @@ -3093,7 +3093,7 @@ mod cache_config_converters { } Ok(Self { path: spill_local_disk_path, - max_disk_ratio: spill_local_disk_max_space_percentage / 100.0, + reserved_disk_ratio: spill_local_disk_max_space_percentage / 100.0, global_bytes_limit: spill_local_disk_max_bytes, }) } @@ -3103,7 +3103,7 @@ mod cache_config_converters { fn from(value: inner::SpillConfig) -> Self { Self { spill_local_disk_path: value.path, - spill_local_disk_max_space_percentage: value.max_disk_ratio * 100.0, + spill_local_disk_reserved_space_percentage: value.reserved_disk_ratio * 100.0, spill_local_disk_max_bytes: value.global_bytes_limit, } } diff --git a/src/query/config/src/inner.rs b/src/query/config/src/inner.rs index 296e2cb1db4d..9b508d9daf21 100644 --- a/src/query/config/src/inner.rs +++ b/src/query/config/src/inner.rs @@ -712,8 +712,8 @@ pub struct SpillConfig { /// Path of spill to local disk. disable if it's empty. pub path: String, - /// Allow ratio use of disk space. - pub max_disk_ratio: OrderedFloat, + /// Ratio of the reserve of the disk space. + pub reserved_disk_ratio: OrderedFloat, /// Allow bytes use of disk space. 
pub global_bytes_limit: u64, @@ -723,7 +723,7 @@ impl Default for SpillConfig { fn default() -> Self { Self { path: "./.databend/temp/_query_spill".to_string(), - max_disk_ratio: OrderedFloat(0.6), + reserved_disk_ratio: OrderedFloat(0.3), global_bytes_limit: u64::MAX, } } diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs index faa05be05cac..a0b289619cba 100644 --- a/src/query/storages/common/cache/src/temp_dir.rs +++ b/src/query/storages/common/cache/src/temp_dir.rs @@ -66,7 +66,7 @@ impl TempDirManager { create_dir_all(&path)?; let stat = statvfs(path.as_ref()).map_err(|e| ErrorCode::Internal(e.to_string()))?; - let reserved = (stat.f_blocks as f64 * (1.0 - *config.max_disk_ratio)) as u64; + let reserved = (stat.f_blocks as f64 * *config.reserved_disk_ratio) as u64; (Some(path), reserved) }; @@ -332,7 +332,7 @@ mod tests { let config = SpillConfig { path: "test_data".to_string(), - max_disk_ratio: 0.99.into(), + reserved_disk_ratio: 0.99.into(), global_bytes_limit: 1 << 30, }; @@ -371,7 +371,7 @@ mod tests { let config = SpillConfig { path: "test_data2".to_string(), - max_disk_ratio: 0.99.into(), + reserved_disk_ratio: 0.99.into(), global_bytes_limit: 1 << 30, }; From 7d672db5659c40226eeb8667f2e1bd7e5cf7c7fa Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 27 Sep 2024 12:32:39 +0800 Subject: [PATCH 25/40] record the profile of LocalSpill and RemoteSpill separately Signed-off-by: coldWater --- .../base/src/runtime/profile/profiles.rs | 113 +++++++++++++----- .../serde/transform_aggregate_spill_writer.rs | 18 ++- ...transform_exchange_aggregate_serializer.rs | 18 ++- .../transform_exchange_group_by_serializer.rs | 18 ++- .../serde/transform_group_by_spill_writer.rs | 18 ++- .../serde/transform_spill_reader.rs | 6 +- src/query/service/src/spillers/spiller.rs | 57 ++++++--- 7 files changed, 176 insertions(+), 72 deletions(-) diff --git a/src/common/base/src/runtime/profile/profiles.rs b/src/common/base/src/runtime/profile/profiles.rs index 3f9a4a40d795..6f75ef5d0afc 100644 --- a/src/common/base/src/runtime/profile/profiles.rs +++ b/src/common/base/src/runtime/profile/profiles.rs @@ -37,12 +37,23 @@ pub enum ProfileStatisticsName { ScanBytes, ScanCacheBytes, ScanPartitions, - SpillWriteCount, - SpillWriteBytes, - SpillWriteTime, - SpillReadCount, - SpillReadBytes, - SpillReadTime, + + RemoteSpillWriteCount, + RemoteSpillWriteBytes, + RemoteSpillWriteTime, + + RemoteSpillReadCount, + RemoteSpillReadBytes, + RemoteSpillReadTime, + + LocalSpillWriteCount, + LocalSpillWriteBytes, + LocalSpillWriteTime, + + LocalSpillReadCount, + LocalSpillReadBytes, + LocalSpillReadTime, + RuntimeFilterPruneParts, MemoryUsage, ExternalServerRetryCount, @@ -189,45 +200,87 @@ pub fn get_statistics_desc() -> Arc unit: StatisticsUnit::Count, plain_statistics: true, }), - (ProfileStatisticsName::SpillWriteCount, ProfileDesc { - display_name: "numbers spilled by write", - desc: "The number of spilled by write", - index: ProfileStatisticsName::SpillWriteCount as usize, + (ProfileStatisticsName::RemoteSpillWriteCount, ProfileDesc { + display_name: "numbers remote spilled by write", + desc: "The number of remote spilled by write", + index: ProfileStatisticsName::RemoteSpillWriteCount as usize, + unit: StatisticsUnit::Count, + plain_statistics: true, + }), + (ProfileStatisticsName::RemoteSpillWriteBytes, ProfileDesc { + display_name: "bytes remote spilled by write", + desc: "The bytes remote spilled by write", + index: 
ProfileStatisticsName::RemoteSpillWriteBytes as usize, + unit: StatisticsUnit::Bytes, + plain_statistics: true, + }), + (ProfileStatisticsName::RemoteSpillWriteTime, ProfileDesc { + display_name: "remote spilled time by write", + desc: "The time spent to write remote spill in millisecond", + index: ProfileStatisticsName::RemoteSpillWriteTime as usize, + unit: StatisticsUnit::MillisSeconds, + plain_statistics: false, + }), + (ProfileStatisticsName::RemoteSpillReadCount, ProfileDesc { + display_name: "numbers remote spilled by read", + desc: "The number of remote spilled by read", + index: ProfileStatisticsName::RemoteSpillReadCount as usize, + unit: StatisticsUnit::Count, + plain_statistics: true, + }), + (ProfileStatisticsName::RemoteSpillReadBytes, ProfileDesc { + display_name: "bytes remote spilled by read", + desc: "The bytes remote spilled by read", + index: ProfileStatisticsName::RemoteSpillReadBytes as usize, + unit: StatisticsUnit::Bytes, + plain_statistics: true, + }), + (ProfileStatisticsName::RemoteSpillReadTime, ProfileDesc { + display_name: "remote spilled time by read", + desc: "The time spent to read remote spill in millisecond", + index: ProfileStatisticsName::RemoteSpillReadTime as usize, + unit: StatisticsUnit::MillisSeconds, + plain_statistics: false, + }), + (ProfileStatisticsName::LocalSpillWriteCount, ProfileDesc { + display_name: "numbers local spilled by write", + desc: "The number of local spilled by write", + index: ProfileStatisticsName::LocalSpillWriteCount as usize, unit: StatisticsUnit::Count, plain_statistics: true, }), - (ProfileStatisticsName::SpillWriteBytes, ProfileDesc { - display_name: "bytes spilled by write", - desc: "The bytes spilled by write", - index: ProfileStatisticsName::SpillWriteBytes as usize, + (ProfileStatisticsName::LocalSpillWriteBytes, ProfileDesc { + display_name: "bytes local spilled by write", + desc: "The bytes local spilled by write", + index: ProfileStatisticsName::LocalSpillWriteBytes as usize, unit: StatisticsUnit::Bytes, plain_statistics: true, }), - (ProfileStatisticsName::SpillWriteTime, ProfileDesc { - display_name: "spilled time by write", - desc: "The time spent to write spill in millisecond", - index: ProfileStatisticsName::SpillWriteTime as usize, + (ProfileStatisticsName::LocalSpillWriteTime, ProfileDesc { + display_name: "local spilled time by write", + desc: "The time spent to write local spill in millisecond", + index: ProfileStatisticsName::LocalSpillWriteTime as usize, unit: StatisticsUnit::MillisSeconds, plain_statistics: false, }), - (ProfileStatisticsName::SpillReadCount, ProfileDesc { - display_name: "numbers spilled by read", - desc: "The number of spilled by read", - index: ProfileStatisticsName::SpillReadCount as usize, + (ProfileStatisticsName::LocalSpillReadCount, ProfileDesc { + display_name: "numbers local spilled by read", + desc: "The number of local spilled by read", + index: ProfileStatisticsName::LocalSpillReadCount as usize, unit: StatisticsUnit::Count, plain_statistics: true, }), - (ProfileStatisticsName::SpillReadBytes, ProfileDesc { - display_name: "bytes spilled by read", - desc: "The bytes spilled by read", - index: ProfileStatisticsName::SpillReadBytes as usize, + (ProfileStatisticsName::LocalSpillReadBytes, ProfileDesc { + display_name: "bytes local spilled by read", + desc: "The bytes local spilled by read", + index: ProfileStatisticsName::LocalSpillReadBytes as usize, unit: StatisticsUnit::Bytes, plain_statistics: true, }), - (ProfileStatisticsName::SpillReadTime, ProfileDesc { - 
display_name: "spilled time by read", - desc: "The time spent to read spill in millisecond", - index: ProfileStatisticsName::SpillReadTime as usize, + (ProfileStatisticsName::LocalSpillReadTime, ProfileDesc { + display_name: "local spilled time by read", + desc: "The time spent to read local spill in millisecond", + index: ProfileStatisticsName::LocalSpillReadTime as usize, unit: StatisticsUnit::MillisSeconds, plain_statistics: false, }), diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_spill_writer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_spill_writer.rs index 787a199fe537..3b3d56586b54 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_spill_writer.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_aggregate_spill_writer.rs @@ -269,10 +269,13 @@ pub fn agg_spilling_aggregate_payload( // perf { - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); + Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); Profile::record_usize_profile( - ProfileStatisticsName::SpillWriteTime, + ProfileStatisticsName::RemoteSpillWriteBytes, + write_bytes, + ); + Profile::record_usize_profile( + ProfileStatisticsName::RemoteSpillWriteTime, instant.elapsed().as_millis() as usize, ); } @@ -366,10 +369,13 @@ pub fn spilling_aggregate_payload( // perf { - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); + Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); + Profile::record_usize_profile( + ProfileStatisticsName::RemoteSpillWriteBytes, + write_bytes, + ); Profile::record_usize_profile( - ProfileStatisticsName::SpillWriteTime, + ProfileStatisticsName::RemoteSpillWriteTime, instant.elapsed().as_millis() as usize, ); } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs index 8d621aba7ea6..7e38f9ec41e0 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_aggregate_serializer.rs @@ -311,10 +311,13 @@ fn agg_spilling_aggregate_payload( // perf { - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); + Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); Profile::record_usize_profile( - ProfileStatisticsName::SpillWriteTime, + ProfileStatisticsName::RemoteSpillWriteBytes, + write_bytes, + ); + Profile::record_usize_profile( + ProfileStatisticsName::RemoteSpillWriteTime, instant.elapsed().as_millis() as usize, ); } @@ -432,10 +435,13 @@ fn spilling_aggregate_payload( // perf { - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); + Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); + Profile::record_usize_profile( + 
ProfileStatisticsName::RemoteSpillWriteBytes, + write_bytes, + ); Profile::record_usize_profile( - ProfileStatisticsName::SpillWriteTime, + ProfileStatisticsName::RemoteSpillWriteTime, instant.elapsed().as_millis() as usize, ); } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_group_by_serializer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_group_by_serializer.rs index bdd2b95e29cd..d68a956d1ec9 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_group_by_serializer.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_exchange_group_by_serializer.rs @@ -365,10 +365,13 @@ fn agg_spilling_group_by_payload( // perf { - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); + Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); Profile::record_usize_profile( - ProfileStatisticsName::SpillWriteTime, + ProfileStatisticsName::RemoteSpillWriteBytes, + write_bytes, + ); + Profile::record_usize_profile( + ProfileStatisticsName::RemoteSpillWriteTime, instant.elapsed().as_millis() as usize, ); } @@ -484,10 +487,13 @@ fn spilling_group_by_payload( // perf { - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); + Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); + Profile::record_usize_profile( + ProfileStatisticsName::RemoteSpillWriteBytes, + write_bytes, + ); Profile::record_usize_profile( - ProfileStatisticsName::SpillWriteTime, + ProfileStatisticsName::RemoteSpillWriteTime, instant.elapsed().as_millis() as usize, ); } diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_group_by_spill_writer.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_group_by_spill_writer.rs index 5a3a35219780..04d0c36b7f3d 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_group_by_spill_writer.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_group_by_spill_writer.rs @@ -267,10 +267,13 @@ pub fn agg_spilling_group_by_payload( // perf { - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); + Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); Profile::record_usize_profile( - ProfileStatisticsName::SpillWriteTime, + ProfileStatisticsName::RemoteSpillWriteBytes, + write_bytes, + ); + Profile::record_usize_profile( + ProfileStatisticsName::RemoteSpillWriteTime, instant.elapsed().as_millis() as usize, ); } @@ -360,10 +363,13 @@ pub fn spilling_group_by_payload( // perf { - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1); - Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes); + Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1); + Profile::record_usize_profile( + ProfileStatisticsName::RemoteSpillWriteBytes, + write_bytes, + ); Profile::record_usize_profile( - ProfileStatisticsName::SpillWriteTime, + ProfileStatisticsName::RemoteSpillWriteTime, instant.elapsed().as_millis() as usize, ); } diff --git 
a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs index 3bf4f8bd7d3a..f625de75db9b 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/serde/transform_spill_reader.rs @@ -227,15 +227,15 @@ impl Processor // perf { Profile::record_usize_profile( - ProfileStatisticsName::SpillReadCount, + ProfileStatisticsName::RemoteSpillReadCount, 1, ); Profile::record_usize_profile( - ProfileStatisticsName::SpillReadBytes, + ProfileStatisticsName::RemoteSpillReadBytes, data.len(), ); Profile::record_usize_profile( - ProfileStatisticsName::SpillReadTime, + ProfileStatisticsName::RemoteSpillReadTime, instant.elapsed().as_millis() as usize, ); } diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index 1a32b1654dda..cccf3be6da32 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -131,7 +131,10 @@ impl Spiller { let location = self.write_encodes(data_size, vec![encoded]).await?; // Record statistics. - record_write_profile(&instant, data_size); + match location { + Location::Remote(_) => record_remote_write_profile(&instant, data_size), + Location::Local(_) => record_local_write_profile(&instant, data_size), + } // Record columns layout for spilled data. self.columns_layout.insert(location.clone(), columns_layout); @@ -195,7 +198,10 @@ impl Spiller { let location = self.write_encodes(write_bytes, write_data).await?; // Record statistics. - record_write_profile(&instant, write_bytes); + match location { + Location::Remote(_) => record_remote_write_profile(&instant, write_bytes), + Location::Local(_) => record_local_write_profile(&instant, write_bytes), + } Ok(SpilledData::MergedPartition { location, @@ -213,7 +219,7 @@ impl Spiller { let block = match location { Location::Remote(loc) => { let data = self.operator.read(loc).await?.to_bytes(); - record_read_profile(&instant, data.len()); + record_remote_read_profile(&instant, data.len()); deserialize_block(columns_layout, &data) } Location::Local(path) => { @@ -221,7 +227,7 @@ impl Spiller { debug_assert_eq!(file_size, columns_layout.iter().sum::()); let (buf, range) = dma_read_file_range(path, 0..file_size as u64).await?; let data = &buf[range]; - record_read_profile(&instant, data.len()); + record_local_read_profile(&instant, data.len()); deserialize_block(columns_layout, data) } }; @@ -284,7 +290,10 @@ impl Spiller { }; // Record statistics. - record_read_profile(&instant, data.len()); + match location { + Location::Remote(_) => record_remote_read_profile(&instant, data.len()), + Location::Local(_) => record_local_read_profile(&instant, data.len()), + }; // Deserialize partitioned data block. let partitioned_data = partitions @@ -318,13 +327,13 @@ impl Spiller { .range(data_range) .await? 
.to_bytes();
-                record_read_profile(&instant, data.len());
+                record_remote_read_profile(&instant, data.len());
                 Ok(deserialize_block(columns_layout, &data))
             }
             Location::Local(path) => {
                 let (buf, range) = dma_read_file_range(path, data_range).await?;
                 let data = &buf[range];
-                record_read_profile(&instant, data.len());
+                record_local_read_profile(&instant, data.len());
                 Ok(deserialize_block(columns_layout, data))
             }
         }
@@ -430,20 +439,38 @@ pub fn deserialize_block(columns_layout: &[usize], mut data: &[u8]) -> DataBlock
     DataBlock::new_from_columns(columns)
 }
 
-pub fn record_write_profile(start: &Instant, write_bytes: usize) {
-    Profile::record_usize_profile(ProfileStatisticsName::SpillWriteCount, 1);
-    Profile::record_usize_profile(ProfileStatisticsName::SpillWriteBytes, write_bytes);
+pub fn record_remote_write_profile(start: &Instant, write_bytes: usize) {
+    Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteCount, 1);
+    Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillWriteBytes, write_bytes);
+    Profile::record_usize_profile(
+        ProfileStatisticsName::RemoteSpillWriteTime,
+        start.elapsed().as_millis() as usize,
+    );
+}
+
+pub fn record_remote_read_profile(start: &Instant, read_bytes: usize) {
+    Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillReadCount, 1);
+    Profile::record_usize_profile(ProfileStatisticsName::RemoteSpillReadBytes, read_bytes);
+    Profile::record_usize_profile(
+        ProfileStatisticsName::RemoteSpillReadTime,
+        start.elapsed().as_millis() as usize,
+    );
+}
+
+pub fn record_local_write_profile(start: &Instant, write_bytes: usize) {
+    Profile::record_usize_profile(ProfileStatisticsName::LocalSpillWriteCount, 1);
+    Profile::record_usize_profile(ProfileStatisticsName::LocalSpillWriteBytes, write_bytes);
     Profile::record_usize_profile(
-        ProfileStatisticsName::SpillWriteTime,
+        ProfileStatisticsName::LocalSpillWriteTime,
         start.elapsed().as_millis() as usize,
     );
 }
 
-pub fn record_read_profile(start: &Instant, read_bytes: usize) {
-    Profile::record_usize_profile(ProfileStatisticsName::SpillReadCount, 1);
-    Profile::record_usize_profile(ProfileStatisticsName::SpillReadBytes, read_bytes);
+pub fn record_local_read_profile(start: &Instant, read_bytes: usize) {
+    Profile::record_usize_profile(ProfileStatisticsName::LocalSpillReadCount, 1);
+    Profile::record_usize_profile(ProfileStatisticsName::LocalSpillReadBytes, read_bytes);
     Profile::record_usize_profile(
-        ProfileStatisticsName::SpillReadTime,
+        ProfileStatisticsName::LocalSpillReadTime,
         start.elapsed().as_millis() as usize,
     );
 }

From c529dfce8bb61a0d1b0a4c7e6b4fc96f2ebdd521 Mon Sep 17 00:00:00 2001
From: coldWater
Date: Fri, 27 Sep 2024 13:26:55 +0800
Subject: [PATCH 26/40] disable spill to disk by default

Signed-off-by: coldWater
---
 src/query/settings/src/settings_default.rs      | 2 +-
 src/query/storages/common/cache/src/temp_dir.rs | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)
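Note: with the new default of 0, spilling window partitions to the local disk
is opt-in. The zero value flows through end to end thanks to the guard added
below; a rough sketch of the consuming side (simplified from builder_window.rs
later in this series, variable names illustrative):

    let limit = settings.get_window_partition_spilling_to_disk_bytes_limit()?;
    // With the default limit of 0 this returns None and the spiller keeps
    // writing to remote storage only.
    let disk_spill = TempDirManager::instance().get_disk_spill_dir(limit, &ctx.get_id());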
diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs
index 3bfdf030babf..e0c00f56febd 100644
--- a/src/query/settings/src/settings_default.rs
+++ b/src/query/settings/src/settings_default.rs
@@ -461,7 +461,7 @@ impl DefaultSettings {
                 range: Some(SettingRange::Numeric(0..=100)),
             }),
             ("window_partition_spilling_to_disk_bytes_limit", DefaultSettingValue {
-                value: UserSettingValue::UInt64(10<<30),
+                value: UserSettingValue::UInt64(0),
                 desc: "Sets the maximum amount of local disk in bytes that each window partitioner can use before spilling data to storage during query execution.",
                 mode: SettingMode::Both,
                 range: Some(SettingRange::Numeric(0..=u64::MAX)),
diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs
index a0b289619cba..f7ec110e5089 100644
--- a/src/query/storages/common/cache/src/temp_dir.rs
+++ b/src/query/storages/common/cache/src/temp_dir.rs
@@ -91,6 +91,9 @@ impl TempDirManager {
         limit: usize,
         query_id: &str,
     ) -> Option<Arc<TempDir>> {
+        if limit == 0 {
+            return None;
+        }
         self.root.as_ref()?;
 
         let path = self.root.as_ref().unwrap().join(query_id).into_boxed_path();

From fd9b15b590dff97fd8ee825a361e9de683ec4710 Mon Sep 17 00:00:00 2001
From: coldWater
Date: Fri, 27 Sep 2024 13:30:00 +0800
Subject: [PATCH 27/40] update setting

Signed-off-by: coldWater
---
 src/query/service/src/interpreters/hook/vacuum_hook.rs | 5 ++++-
 src/query/settings/src/settings_default.rs             | 6 ++++++
 src/query/settings/src/settings_getter_setter.rs       | 4 ++++
 .../query/window_function/window_partition_spill.test  | 3 +++
 tests/sqllogictests/suites/tpcds/spill.test            | 3 +++
 tests/sqllogictests/suites/tpch/spill.test             | 3 +++
 6 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/query/service/src/interpreters/hook/vacuum_hook.rs b/src/query/service/src/interpreters/hook/vacuum_hook.rs
index 94702eb34dd8..fc5615ed8f9b 100644
--- a/src/query/service/src/interpreters/hook/vacuum_hook.rs
+++ b/src/query/service/src/interpreters/hook/vacuum_hook.rs
@@ -73,7 +73,10 @@ pub fn hook_disk_temp_dir(query_ctx: &Arc<QueryContext>) -> Result<()> {
     let mgr = TempDirManager::instance();
     if mgr.drop_disk_spill_dir(&query_ctx.get_id())? && rand::thread_rng().gen_ratio(1, 10) {
-        let deleted = mgr.drop_disk_spill_dir_unknown(5)?;
+        let limit = query_ctx
+            .get_settings()
+            .get_spilling_to_disk_vacuum_unknown_temp_dirs_limit();
+        let deleted = mgr.drop_disk_spill_dir_unknown(limit)?;
         if !deleted.is_empty() {
             warn!("Deleted residual temporary directories: {:?}", deleted)
         }
diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs
index e0c00f56febd..2a14294f032f 100644
--- a/src/query/settings/src/settings_default.rs
+++ b/src/query/settings/src/settings_default.rs
@@ -303,6 +303,12 @@ impl DefaultSettings {
                 mode: SettingMode::Both,
                 range: Some(SettingRange::Numeric(0..=u64::MAX)),
             }),
+            ("spilling_to_disk_vacuum_unknown_temp_dirs_limit", DefaultSettingValue {
+                value: UserSettingValue::UInt64(u64::MAX),
+                desc: "Sets the maximum number of residual temporary directories to vacuum. If a previous query was interrupted unexpectedly and left temporary directories behind, they are cleaned up after this query, up to this limit.",
+                mode: SettingMode::Both,
+                range: Some(SettingRange::Numeric(0..=u64::MAX)),
+            }),
             ("enable_merge_into_row_fetch", DefaultSettingValue {
                 value: UserSettingValue::UInt64(1),
                 desc: "Enable merge into row fetch optimization.",
diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs
index 6c54802ff10c..dd22210af7ab 100644
--- a/src/query/settings/src/settings_getter_setter.rs
+++ b/src/query/settings/src/settings_getter_setter.rs
@@ -286,6 +286,10 @@ impl Settings {
         Ok(self.try_get_u64("join_spilling_buffer_threshold_per_proc_mb")? as usize)
     }
 
+    pub fn get_spilling_to_disk_vacuum_unknown_temp_dirs_limit(&self) -> Result<usize> {
+        Ok(self.try_get_u64("spilling_to_disk_vacuum_unknown_temp_dirs_limit")? as usize)
+    }
+
     pub fn get_inlist_to_join_threshold(&self) -> Result<usize> {
         Ok(self.try_get_u64("inlist_to_join_threshold")?
as usize) } diff --git a/tests/sqllogictests/suites/query/window_function/window_partition_spill.test b/tests/sqllogictests/suites/query/window_function/window_partition_spill.test index 2f268148905e..fccbd097486a 100644 --- a/tests/sqllogictests/suites/query/window_function/window_partition_spill.test +++ b/tests/sqllogictests/suites/query/window_function/window_partition_spill.test @@ -7,6 +7,9 @@ USE test_window_partition_spill statement ok set window_partition_spilling_bytes_threshold_per_proc = 1024 * 1024 * 1; +statement ok +set window_partition_spilling_to_disk_bytes_limit = 1024 * 1024 * 1024; + query T SELECT SUM(number + a + b) FROM ( diff --git a/tests/sqllogictests/suites/tpcds/spill.test b/tests/sqllogictests/suites/tpcds/spill.test index 9446d0209a66..366c85121b6d 100644 --- a/tests/sqllogictests/suites/tpcds/spill.test +++ b/tests/sqllogictests/suites/tpcds/spill.test @@ -23,6 +23,9 @@ set sort_spilling_bytes_threshold_per_proc = 1; statement ok set window_partition_spilling_memory_ratio = 1; +statement ok +set window_partition_spilling_to_disk_bytes_limit = 1024 * 1024 * 1024; + statement ok set window_partition_spilling_bytes_threshold_per_proc = 1; diff --git a/tests/sqllogictests/suites/tpch/spill.test b/tests/sqllogictests/suites/tpch/spill.test index 757154bbe84b..c393f2082b61 100644 --- a/tests/sqllogictests/suites/tpch/spill.test +++ b/tests/sqllogictests/suites/tpch/spill.test @@ -23,6 +23,9 @@ set sort_spilling_bytes_threshold_per_proc = 1; statement ok set window_partition_spilling_memory_ratio = 1; +statement ok +set window_partition_spilling_to_disk_bytes_limit = 1024 * 1024 * 1024; + statement ok set window_partition_spilling_bytes_threshold_per_proc = 1; From b343046f226b501e39aa78e8e610225b08f18df5 Mon Sep 17 00:00:00 2001 From: coldWater Date: Fri, 27 Sep 2024 14:41:37 +0800 Subject: [PATCH 28/40] fix Signed-off-by: coldWater --- src/query/service/src/interpreters/hook/vacuum_hook.rs | 2 +- src/query/storages/common/cache/src/temp_dir.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/query/service/src/interpreters/hook/vacuum_hook.rs b/src/query/service/src/interpreters/hook/vacuum_hook.rs index fc5615ed8f9b..21f01ea8cc6f 100644 --- a/src/query/service/src/interpreters/hook/vacuum_hook.rs +++ b/src/query/service/src/interpreters/hook/vacuum_hook.rs @@ -75,7 +75,7 @@ pub fn hook_disk_temp_dir(query_ctx: &Arc) -> Result<()> { if mgr.drop_disk_spill_dir(&query_ctx.get_id())? 
&& rand::thread_rng().gen_ratio(1, 10) {
         let limit = query_ctx
             .get_settings()
-            .get_spilling_to_disk_vacuum_unknown_temp_dirs_limit();
+            .get_spilling_to_disk_vacuum_unknown_temp_dirs_limit()?;
         let deleted = mgr.drop_disk_spill_dir_unknown(limit)?;
         if !deleted.is_empty() {
             warn!("Deleted residual temporary directories: {:?}", deleted)
diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs
index f7ec110e5089..facba317aa80 100644
--- a/src/query/storages/common/cache/src/temp_dir.rs
+++ b/src/query/storages/common/cache/src/temp_dir.rs
@@ -335,7 +335,7 @@ mod tests {
 
         let config = SpillConfig {
             path: "test_data".to_string(),
-            reserved_disk_ratio: 0.99.into(),
+            reserved_disk_ratio: 0.01.into(),
             global_bytes_limit: 1 << 30,
         };

From 4bcaa38d3bdb980e2ab8e92f0eab0f88f48f0c17 Mon Sep 17 00:00:00 2001
From: coldWater
Date: Fri, 27 Sep 2024 18:17:49 +0800
Subject: [PATCH 29/40] fix dma

Signed-off-by: coldWater
---
 src/common/base/src/base/dma.rs | 35 ++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs
index 7c582c6e430b..bbbb6e898862 100644
--- a/src/common/base/src/base/dma.rs
+++ b/src/common/base/src/base/dma.rs
@@ -280,7 +280,16 @@ pub async fn dma_write_file_vectored<'a>(
     bufs: &'a [IoSlice<'a>],
 ) -> io::Result<usize> {
     let mut file = DmaFile::create(path.as_ref()).await?;
-    let buf = DmaBuffer::new(file.alignment, file.alignment);
+
+    let file_length = bufs.iter().map(|buf| buf.len()).sum();
+    if file_length == 0 {
+        return Ok(0);
+    }
+
+    const BUFFER_SIZE: usize = 1024 * 1024;
+    let buffer_size = BUFFER_SIZE.min(file_length);
+
+    let buf = DmaBuffer::new(file.align_up(buffer_size), file.alignment);
     file.set_buffer(buf);
 
     for buf in bufs {
@@ -300,7 +309,6 @@ pub async fn dma_write_file_vectored<'a>(
         }
     }
 
-    let file_length = bufs.iter().map(|buf| buf.len()).sum();
     let len = file.buffer().len();
     if len > 0 {
         let align_up = file.align_up(len);
@@ -326,9 +334,12 @@ pub async fn dma_read_file(
     file.set_buffer(buf);
 
     let mut n = 0;
-    let read_n = file.alignment;
     loop {
-        file = asyncify(move || file.read_direct(read_n).map(|_| file)).await?;
+        file = asyncify(move || {
+            let remain = file.buffer().remaining();
+            file.read_direct(remain).map(|_| file)
+        })
+        .await?;
 
         let buf = file.buffer();
         if buf.is_empty() {
@@ -370,9 +381,19 @@ pub async fn dma_read_file_range(
         }
     }
 
-    while file.buffer().remaining() > 0 {
-        let read_n = file.buffer().remaining();
-        file = asyncify(move || file.read_direct(read_n).map(|_| file)).await?;
+    let mut n;
+    loop {
+        (file, n) = asyncify(move || {
+            let remain = file.buffer().remaining();
+            file.read_direct(remain).map(|n| (file, n))
+        })
+        .await?;
+        if align_start + file.buffer().len() >= range.end as usize {
+            break;
+        }
+        if n == 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                "eof reached before the requested range was read",
+            ));
+        }
     }
 
     let rt_range = range.start as usize - align_start..range.end as usize - align_start;
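Note: the rewritten loop distinguishes a read that simply needs another pass
from a file that is genuinely shorter than the requested range; only the
latter is an error. For reference, a minimal sketch of the caller's contract
(values hypothetical; the real call sites are in spiller.rs):

    // Read bytes [8, 72) of a spill file through O_DIRECT. The returned
    // buffer is aligned, so the requested bytes sit at `range` inside it.
    let (buf, range) = dma_read_file_range(path, 8..72).await?;
    let data = &buf[range];
    assert_eq!(data.len(), 64); // assumes the file holds at least 72 bytes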
From 8520d8073fbe73e6bcbed2043f0a391de6c0994d Mon Sep 17 00:00:00 2001
From: coldWater
Date: Sun, 29 Sep 2024 18:15:21 +0800
Subject: [PATCH 30/40] fix

Signed-off-by: coldWater
---
 src/common/base/src/base/dma.rs                            | 4 +++-
 .../window/partition/transform_window_partition_collect.rs | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs
index bbbb6e898862..3c289068a317 100644
--- a/src/common/base/src/base/dma.rs
+++ b/src/common/base/src/base/dma.rs
@@ -329,8 +329,9 @@ pub async fn dma_read_file(
     path: impl AsRef<Path>,
     mut writer: impl io::Write,
 ) -> io::Result<usize> {
+    const BUFFER_SIZE: usize = 1024 * 1024;
     let mut file = DmaFile::open(path.as_ref()).await?;
-    let buf = DmaBuffer::new(file.alignment, file.alignment);
+    let buf = DmaBuffer::new(file.align_up(BUFFER_SIZE), file.alignment);
     file.set_buffer(buf);
 
     let mut n = 0;
@@ -347,6 +348,7 @@ pub async fn dma_read_file(
         }
         n += buf.len();
         writer.write_all(buf)?;
+        // WARN: Is it possible to have a short read but not eof?
        let eof = buf.remaining() > 0;
         unsafe { file.mut_buffer().set_len(0) }
         if eof {
diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs
index 431d0208e0d9..f983b6208e65 100644
--- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs
+++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs
@@ -128,7 +128,7 @@ impl TransformWindowPartitionCollect {
         // Create the window partition buffer.
         let sort_block_size = settings.get_window_partition_sort_block_size()? as usize;
         let buffer =
-            WindowPartitionBuffer::new(spiller, num_partitions, sort_block_size, spill_settings)?;
+            WindowPartitionBuffer::new(spiller, partitions.len(), sort_block_size, spill_settings)?;
 
         let max_block_size = settings.get_max_block_size()? as usize;
         let enable_loser_tree = settings.get_enable_loser_tree_merge_sort()?;

From 1583f7725e57a4219a934b5b5155cb974264b06c Mon Sep 17 00:00:00 2001
From: coldWater
Date: Mon, 30 Sep 2024 00:36:48 +0800
Subject: [PATCH 31/40] refactor dma

Signed-off-by: coldWater
---
 src/common/base/src/base/dma.rs           | 183 ++++++++++------------
 src/common/base/src/base/mod.rs           |   1 +
 src/query/service/src/spillers/spiller.rs |  14 +-
 3 files changed, 92 insertions(+), 106 deletions(-)

diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs
index 3c289068a317..1882ef89ce6f 100644
--- a/src/common/base/src/base/dma.rs
+++ b/src/common/base/src/base/dma.rs
@@ -12,18 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::alloc::alloc;
-use std::alloc::dealloc;
+use std::alloc::AllocError;
+use std::alloc::Allocator;
+use std::alloc::Global;
 use std::alloc::Layout;
 use std::io;
 use std::io::IoSlice;
 use std::io::SeekFrom;
-use std::ops::Deref;
-use std::ops::DerefMut;
 use std::ops::Range;
 use std::os::fd::BorrowedFd;
 use std::os::unix::io::AsRawFd;
 use std::path::Path;
+use std::ptr::NonNull;
 
 use rustix::fs::OFlags;
 use tokio::fs::File;
@@ -31,103 +31,75 @@ use tokio::io::AsyncSeekExt;
 
 use crate::runtime::spawn_blocking;
 
-/// An aligned buffer used to perform io on a `DmaFile`.
-#[derive(Debug)]
-pub struct DmaBuffer {
-    cap: usize,
-    len: usize,
+unsafe impl Send for DmaAllocator {}
+
+pub struct DmaAllocator {
     align: usize,
-    data: *mut u8,
 }
 
-unsafe impl Send for DmaBuffer {}
-
-impl DmaBuffer {
-    /// Allocates an aligned buffer.
-    fn new(cap: usize, align: usize) -> DmaBuffer {
-        let layout = Layout::from_size_align(cap, align).unwrap();
-        let data = unsafe { alloc(layout) };
-        Self {
-            data,
-            cap,
-            align,
-            len: 0,
-        }
+impl DmaAllocator {
+    pub fn new(align: usize) -> DmaAllocator {
+        DmaAllocator { align }
     }
 
-    /// Sets the internal length of the buffer.
The caller must ensure that the memory is - /// initialized until `new_len` before calling. - pub unsafe fn set_len(&mut self, new_len: usize) { - debug_assert!(new_len <= self.cap); - self.len = new_len; + fn real_layout(&self, layout: Layout) -> Layout { + Layout::from_size_align(self.real_cap(layout.size()), self.align).unwrap() } - /// Returns the number of initialized bytes in the buffer. - pub fn len(&self) -> usize { - self.len + fn real_cap(&self, cap: usize) -> usize { + align_up(self.align, cap) } +} - /// Returns the capacity for this `DmaBuffer`. - pub fn capacity(&self) -> usize { - self.cap +unsafe impl Allocator for DmaAllocator { + fn allocate(&self, layout: Layout) -> Result, AllocError> { + Global {}.allocate(self.real_layout(layout)) } - /// Returns the remaining capacity in the buffer. - pub fn remaining(&self) -> usize { - self.capacity() - self.len() + fn allocate_zeroed(&self, layout: Layout) -> Result, AllocError> { + Global {}.allocate_zeroed(self.real_layout(layout)) } - /// Returns a raw pointer to the buffer's data. - pub fn as_ptr(&self) -> *const u8 { - self.data as *const _ + unsafe fn grow( + &self, + ptr: NonNull, + old_layout: Layout, + new_layout: Layout, + ) -> Result, AllocError> { + Global {}.grow( + ptr, + self.real_layout(old_layout), + self.real_layout(new_layout), + ) } - /// Returns an unsafe mutable pointer to the buffer's data. - pub fn as_mut_ptr(&mut self) -> *mut u8 { - self.data + unsafe fn grow_zeroed( + &self, + ptr: NonNull, + old_layout: Layout, + new_layout: Layout, + ) -> Result, AllocError> { + Global {}.grow_zeroed( + ptr, + self.real_layout(old_layout), + self.real_layout(new_layout), + ) } - /// Extends `self` with the content of `other`. - /// Panics if `self` doesn't have enough capacity left to contain `other`. 
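Aside: the core of this refactor is replacing the hand-rolled buffer with a
plain Vec driven by a custom allocator that rounds every allocation up to the
device alignment. A standalone sketch of the pattern (nightly only, requires
the allocator_api feature; AlignedAlloc is a made-up name, not the type added
here):

    #![feature(allocator_api)]
    use std::alloc::{AllocError, Allocator, Global, Layout};
    use std::ptr::NonNull;

    struct AlignedAlloc(usize); // required alignment, must be a power of two

    unsafe impl Allocator for AlignedAlloc {
        fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> {
            // Raise every request to the device alignment, then defer to Global.
            let layout =
                Layout::from_size_align(layout.size(), self.0).map_err(|_| AllocError)?;
            Global.allocate(layout)
        }

        unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) {
            // Rebuild the same adjusted layout that allocate() used.
            let layout = Layout::from_size_align(layout.size(), self.0).unwrap();
            Global.deallocate(ptr, layout)
        }
    }

    fn main() {
        // Behaves like an ordinary Vec, but its storage is 4096-byte aligned.
        let mut buf: Vec<u8, AlignedAlloc> = Vec::with_capacity_in(8192, AlignedAlloc(4096));
        buf.extend_from_slice(b"hello");
        assert_eq!(buf.as_ptr() as usize % 4096, 0);
    }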
- pub fn extend_from_slice(&mut self, other: &[u8]) { - assert!(other.len() <= self.remaining()); - - let buf = unsafe { std::slice::from_raw_parts_mut(self.data.add(self.len()), other.len()) }; - buf.copy_from_slice(other); - self.len += other.len(); + unsafe fn deallocate(&self, ptr: std::ptr::NonNull, layout: Layout) { + Global {}.deallocate(ptr, self.real_layout(layout)) } } -impl Deref for DmaBuffer { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - unsafe { std::slice::from_raw_parts(self.data, self.len()) } - } -} +type DmaBuffer = Vec; -impl DerefMut for DmaBuffer { - fn deref_mut(&mut self) -> &mut Self::Target { - unsafe { std::slice::from_raw_parts_mut(self.data, self.len()) } - } -} +pub fn dma_buffer_as_vec(mut buf: DmaBuffer) -> Vec { + let ptr = buf.as_mut_ptr(); + let len = buf.len(); + let cap = buf.allocator().real_cap(buf.capacity()); + std::mem::forget(buf); -impl Drop for DmaBuffer { - fn drop(&mut self) { - let layout = Layout::from_size_align(self.cap, self.align).unwrap(); - unsafe { dealloc(self.data, layout) } - } -} - -impl From for Vec { - fn from(mut val: DmaBuffer) -> Self { - let length = val.len; - let cap = val.cap; - - let v = unsafe { Vec::from_raw_parts(val.as_mut_ptr(), length, cap) }; - std::mem::forget(val); - v - } + unsafe { Vec::from_raw_parts(ptr, len, cap) } } /// A `DmaFile` is similar to a `File`, but it is opened with the `O_DIRECT` file in order to @@ -200,7 +172,7 @@ impl DmaFile { if n != buf.len() { return Err(io::Error::new(io::ErrorKind::Other, "short write")); } - unsafe { self.mut_buffer().set_len(0) }; + self.mut_buffer().clear(); Ok(n) } Err(e) => Err(e.into()), @@ -210,14 +182,14 @@ impl DmaFile { fn read_direct(&mut self, n: usize) -> io::Result { let Self { fd, buf, .. } = self; let buf = buf.as_mut().unwrap(); - if n > buf.remaining() { + if n > buf.capacity() - buf.len() { return Err(io::Error::new(io::ErrorKind::Other, "buf not sufficient")); } let start = buf.len(); unsafe { buf.set_len(buf.len() + n) }; match rustix::io::read(fd, &mut (*buf)[start..]) { Ok(n) => { - unsafe { buf.set_len(start + n) }; + buf.truncate(start + n); Ok(n) } Err(e) => Err(e.into()), @@ -289,19 +261,23 @@ pub async fn dma_write_file_vectored<'a>( const BUFFER_SIZE: usize = 1024 * 1024; let buffer_size = BUFFER_SIZE.min(file_length); - let buf = DmaBuffer::new(file.align_up(buffer_size), file.alignment); + let buf = Vec::with_capacity_in( + file.align_up(buffer_size), + DmaAllocator::new(file.alignment), + ); file.set_buffer(buf); for buf in bufs { let mut buf = &buf[..]; while !buf.is_empty() { - if file.buffer().remaining() == 0 { + let dst = file.buffer(); + if dst.capacity() == dst.len() { file = asyncify(move || file.write_direct().map(|_| file)).await?; } let dst = file.mut_buffer(); - let remaining = dst.remaining(); + let remaining = dst.capacity() - dst.len(); let n = buf.len().min(remaining); let (left, right) = buf.split_at(n); dst.extend_from_slice(left); @@ -313,12 +289,15 @@ pub async fn dma_write_file_vectored<'a>( if len > 0 { let align_up = file.align_up(len); if align_up == len { - asyncify(move || file.write_direct().map(|_| file)).await?; + asyncify(move || file.write_direct()).await?; } else { let dst = file.mut_buffer(); unsafe { dst.set_len(align_up) } - file = asyncify(move || file.write_direct().map(|_| file)).await?; - asyncify(move || file.truncate(file_length).map(|_| file)).await?; + asyncify(move || { + file.write_direct()?; + file.truncate(file_length) + }) + .await?; } } @@ -331,13 +310,17 @@ pub async fn 
dma_read_file( ) -> io::Result { const BUFFER_SIZE: usize = 1024 * 1024; let mut file = DmaFile::open(path.as_ref()).await?; - let buf = DmaBuffer::new(file.align_up(BUFFER_SIZE), file.alignment); + let buf = Vec::with_capacity_in( + file.align_up(BUFFER_SIZE), + DmaAllocator::new(file.alignment), + ); file.set_buffer(buf); let mut n = 0; loop { file = asyncify(move || { - let remain = file.buffer().remaining(); + let buf = file.buffer(); + let remain = buf.capacity() - buf.len(); file.read_direct(remain).map(|_| file) }) .await?; @@ -349,7 +332,7 @@ pub async fn dma_read_file( n += buf.len(); writer.write_all(buf)?; // WARN: Is it possible to have a short read but not eof? - let eof = buf.remaining() > 0; + let eof = buf.capacity() > buf.len(); unsafe { file.mut_buffer().set_len(0) } if eof { return Ok(n); @@ -361,16 +344,12 @@ pub async fn dma_read_file_range( path: impl AsRef, range: Range, ) -> io::Result<(DmaBuffer, Range)> { - if range.is_empty() { - return Ok((DmaBuffer::new(2, 2), 0..0)); - } - let mut file = DmaFile::open(path.as_ref()).await?; let align_start = file.align_down(range.start as usize); let align_end = file.align_up(range.end as usize); - let buf = DmaBuffer::new(align_end - align_start, file.alignment); + let buf = Vec::with_capacity_in(align_end - align_start, DmaAllocator::new(file.alignment)); file.set_buffer(buf); if align_start != 0 { @@ -386,7 +365,8 @@ pub async fn dma_read_file_range( let mut n; loop { (file, n) = asyncify(move || { - let remain = file.buffer().remaining(); + let buf = file.buffer(); + let remain = buf.capacity() - buf.len(); file.read_direct(remain).map(|n| (file, n)) }) .await?; @@ -421,6 +401,10 @@ mod tests { run_test(4096 * 2 - 1).await.unwrap(); run_test(4096 * 2).await.unwrap(); run_test(4096 * 2 + 1).await.unwrap(); + + run_test(1024 * 1024 * 3 - 1).await.unwrap(); + run_test(1024 * 1024 * 3).await.unwrap(); + run_test(1024 * 1024 * 3 + 1).await.unwrap(); } async fn run_test(n: usize) -> io::Result<()> { @@ -438,6 +422,9 @@ mod tests { assert_eq!(length, want.len()); assert_eq!(got, want); + let (buf, range) = dma_read_file_range(filename, 0..length as u64).await?; + assert_eq!(&buf[range], &want); + std::fs::remove_file(filename)?; Ok(()) } @@ -494,7 +481,7 @@ mod tests { dma_write_file_vectored(filename, &bufs).await.unwrap(); let mut file = DmaFile::open(filename).await.unwrap(); - let buf = DmaBuffer::new(file_size, file.alignment); + let buf = Vec::with_capacity_in(file_size, DmaAllocator::new(file.alignment)); file.set_buffer(buf); let got = file.read_direct(alignment).unwrap(); diff --git a/src/common/base/src/base/mod.rs b/src/common/base/src/base/mod.rs index 4ba645c70c31..5ac11ea7a46c 100644 --- a/src/common/base/src/base/mod.rs +++ b/src/common/base/src/base/mod.rs @@ -28,6 +28,7 @@ mod take_mut; mod uniq_id; mod watch_notify; +pub use dma::dma_buffer_as_vec; pub use dma::dma_read_file; pub use dma::dma_read_file_range; pub use dma::dma_write_file_vectored; diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index cccf3be6da32..d13229b65df6 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -21,6 +21,7 @@ use std::ops::Range; use std::sync::Arc; use std::time::Instant; +use databend_common_base::base::dma_buffer_as_vec; use databend_common_base::base::dma_read_file_range; use databend_common_base::base::dma_write_file_vectored; use databend_common_base::base::GlobalUniqName; @@ -125,7 +126,7 @@ impl Spiller { let instant = 
Instant::now(); // Spill data to storage. - let encoded = EncodedBlock::from_block(&data_block); + let encoded = EncodedBlock::from_block(data_block); let columns_layout = encoded.columns_layout(); let data_size = encoded.size(); let location = self.write_encodes(data_size, vec![encoded]).await?; @@ -184,7 +185,7 @@ impl Spiller { for (partition_id, data_block) in partitioned_data.into_iter() { let begin = write_bytes; - let encoded = EncodedBlock::from_block(&data_block); + let encoded = EncodedBlock::from_block(data_block); let columns_layout = encoded.columns_layout(); let data_size = encoded.size(); @@ -280,12 +281,9 @@ impl Spiller { let (mut buf, range) = dma_read_file_range(path, 0..file_size as u64).await?; assert_eq!(range.start, 0); - unsafe { - buf.set_len(range.end); - } + buf.truncate(range.end); - let buf: Vec = buf.into(); - buf.into() + dma_buffer_as_vec(buf).into() } }; @@ -403,7 +401,7 @@ pub enum Location { pub struct EncodedBlock(pub Vec>); impl EncodedBlock { - pub fn from_block(block: &DataBlock) -> Self { + pub fn from_block(block: DataBlock) -> Self { let data = block .columns() .iter() From 657c1c2e8580752d16540308f6cae29d25f61f4c Mon Sep 17 00:00:00 2001 From: coldWater Date: Sun, 29 Sep 2024 19:41:05 +0800 Subject: [PATCH 32/40] change config Signed-off-by: coldWater --- src/query/config/src/config.rs | 2 +- src/query/config/src/inner.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/query/config/src/config.rs b/src/query/config/src/config.rs index c1a9dfab71bb..dc65b38d9aed 100644 --- a/src/query/config/src/config.rs +++ b/src/query/config/src/config.rs @@ -2942,7 +2942,7 @@ pub struct SpillConfig { #[clap( long, value_name = "VALUE", - default_value = "./.databend/temp/_query_spill" + default_value = "./.databend/_cache/temp/_query_spill" )] pub spill_local_disk_path: String, diff --git a/src/query/config/src/inner.rs b/src/query/config/src/inner.rs index 9b508d9daf21..bc2aec350256 100644 --- a/src/query/config/src/inner.rs +++ b/src/query/config/src/inner.rs @@ -722,7 +722,7 @@ pub struct SpillConfig { impl Default for SpillConfig { fn default() -> Self { Self { - path: "./.databend/temp/_query_spill".to_string(), + path: "./.databend/_cache/temp/_query_spill".to_string(), reserved_disk_ratio: OrderedFloat(0.3), global_bytes_limit: u64::MAX, } From 0e70745913c5fd35c97d4db30fd04d557635ade7 Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 30 Sep 2024 16:07:48 +0800 Subject: [PATCH 33/40] Alignment Signed-off-by: coldWater --- src/common/base/src/base/dma.rs | 46 ++++++++++++++++----------------- src/common/base/src/lib.rs | 1 + 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index 1882ef89ce6f..063b9d5467fd 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -23,6 +23,7 @@ use std::ops::Range; use std::os::fd::BorrowedFd; use std::os::unix::io::AsRawFd; use std::path::Path; +use std::ptr::Alignment; use std::ptr::NonNull; use rustix::fs::OFlags; @@ -33,21 +34,19 @@ use crate::runtime::spawn_blocking; unsafe impl Send for DmaAllocator {} -pub struct DmaAllocator { - align: usize, -} +pub struct DmaAllocator(Alignment); impl DmaAllocator { - pub fn new(align: usize) -> DmaAllocator { - DmaAllocator { align } + pub fn new(align: Alignment) -> DmaAllocator { + DmaAllocator(align) } fn real_layout(&self, layout: Layout) -> Layout { - Layout::from_size_align(self.real_cap(layout.size()), self.align).unwrap() + 
Layout::from_size_align(layout.size(), self.0.as_usize()).unwrap() } fn real_cap(&self, cap: usize) -> usize { - align_up(self.align, cap) + align_up(self.0, cap) } } @@ -106,7 +105,7 @@ pub fn dma_buffer_as_vec(mut buf: DmaBuffer) -> Vec { /// perform direct IO. struct DmaFile { fd: File, - alignment: usize, + alignment: Alignment, buf: Option, } @@ -145,15 +144,14 @@ impl DmaFile { } /// Aligns `value` down to the memory alignment requirement for this file. - #[allow(dead_code)] pub fn align_down(&self, value: usize) -> usize { align_down(self.alignment, value) } /// Return the alignment requirement for this file. The returned alignment value can be used /// to allocate a buffer to use with this file: - #[allow(dead_code)] - pub fn alignment(&self) -> usize { + #[expect(dead_code)] + pub fn alignment(&self) -> Alignment { self.alignment } @@ -205,17 +203,17 @@ impl DmaFile { } } -pub fn align_up(alignment: usize, value: usize) -> usize { - (value + alignment - 1) & !(alignment - 1) +pub fn align_up(alignment: Alignment, value: usize) -> usize { + (value + alignment.as_usize() - 1) & alignment.mask() } -pub fn align_down(alignment: usize, value: usize) -> usize { - value & !(alignment - 1) +pub fn align_down(alignment: Alignment, value: usize) -> usize { + value & alignment.mask() } async fn open_dma(file: File) -> io::Result { let stat = fstatvfs(&file).await?; - let alignment = stat.f_bsize.max(512) as usize; + let alignment = Alignment::new(stat.f_bsize.max(512) as usize).unwrap(); Ok(DmaFile { fd: file, @@ -261,16 +259,16 @@ pub async fn dma_write_file_vectored<'a>( const BUFFER_SIZE: usize = 1024 * 1024; let buffer_size = BUFFER_SIZE.min(file_length); - let buf = Vec::with_capacity_in( + let dma_buf = Vec::with_capacity_in( file.align_up(buffer_size), DmaAllocator::new(file.alignment), ); - file.set_buffer(buf); + file.set_buffer(dma_buf); - for buf in bufs { - let mut buf = &buf[..]; + for src in bufs { + let mut src = &src[..]; - while !buf.is_empty() { + while !src.is_empty() { let dst = file.buffer(); if dst.capacity() == dst.len() { file = asyncify(move || file.write_direct().map(|_| file)).await?; @@ -278,10 +276,10 @@ pub async fn dma_write_file_vectored<'a>( let dst = file.mut_buffer(); let remaining = dst.capacity() - dst.len(); - let n = buf.len().min(remaining); - let (left, right) = buf.split_at(n); + let n = src.len().min(remaining); + let (left, right) = src.split_at(n); dst.extend_from_slice(left); - buf = right; + src = right; } } diff --git a/src/common/base/src/lib.rs b/src/common/base/src/lib.rs index 2fa7b8dd03dc..1e3706521f71 100644 --- a/src/common/base/src/lib.rs +++ b/src/common/base/src/lib.rs @@ -24,6 +24,7 @@ #![feature(alloc_error_hook)] #![feature(slice_swap_unchecked)] #![feature(variant_count)] +#![feature(ptr_alignment_type)] pub mod base; pub mod containers; From b06b3d73cbfd6eecf6f465e7cea824dc8a2853db Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 30 Sep 2024 21:26:57 +0800 Subject: [PATCH 34/40] cloud test Signed-off-by: coldWater --- .../src/pipelines/builders/builder_window.rs | 5 +- .../storages/common/cache/src/temp_dir.rs | 159 ++++++++++-------- 2 files changed, 90 insertions(+), 74 deletions(-) diff --git a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index 863b69cc3210..d94f4f9eaf94 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -172,8 +172,9 @@ impl PipelineBuilder { ); 
let disk_bytes_limit = settings.get_window_partition_spilling_to_disk_bytes_limit()?; - let disk_spill = - TempDirManager::instance().get_disk_spill_dir(disk_bytes_limit, &self.ctx.get_id()); + let temp_dir_manager = TempDirManager::instance(); + temp_dir_manager.dir_xxxx()?; + let disk_spill = temp_dir_manager.get_disk_spill_dir(disk_bytes_limit, &self.ctx.get_id()); let window_spill_settings = WindowSpillSettings::new(&settings, num_processors)?; let have_order_col = window_partition.after_exchange.unwrap_or(false); diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs index facba317aa80..8b4df188dc02 100644 --- a/src/query/storages/common/cache/src/temp_dir.rs +++ b/src/query/storages/common/cache/src/temp_dir.rs @@ -57,18 +57,7 @@ impl TempDirManager { .join(tenant_id) .into_boxed_path(); - if let Err(e) = remove_dir_all(&path) { - if !matches!(e.kind(), ErrorKind::NotFound) { - Err(e)?; - } - } - - create_dir_all(&path)?; - - let stat = statvfs(path.as_ref()).map_err(|e| ErrorCode::Internal(e.to_string()))?; - let reserved = (stat.f_blocks as f64 * *config.reserved_disk_ratio) as u64; - - (Some(path), reserved) + (Some(path), 0) }; GlobalInstance::set(Arc::new(Self { @@ -82,6 +71,30 @@ impl TempDirManager { Ok(()) } + pub fn dir_xxxx(&self) -> Result<()> { + let Some(path) = &self.root else { + return Err(ErrorCode::Internal(format!("emtpy root"))); + }; + + if let Err(e) = remove_dir_all(path) { + if !matches!(e.kind(), ErrorKind::NotFound) { + return Err(ErrorCode::Internal(format!( + "remove_dir_all {:?} {}", + path, e, + ))); + } + } + + create_dir_all(path) + .map_err(|e| ErrorCode::Internal(format!("create_dir_all {:?} {}", path, e)))?; + + let _stat = statvfs(path.as_ref()) + .map_err(|e| ErrorCode::Internal(format!("statvfs {:?} {}", path, e)))?; + // let reserved = (stat.f_blocks as f64 * *config.reserved_disk_ratio) as u64; + + Ok(()) + } + pub fn instance() -> Arc { GlobalInstance::get() } @@ -94,9 +107,8 @@ impl TempDirManager { if limit == 0 { return None; } - self.root.as_ref()?; - let path = self.root.as_ref().unwrap().join(query_id).into_boxed_path(); + let path = self.root.as_ref()?.join(query_id).into_boxed_path(); let mut group = self.group.lock().unwrap(); let dir = match group.dirs.entry(path.clone()) { Entry::Occupied(o) => TempDir { @@ -322,84 +334,87 @@ impl Drop for InnerPath { #[cfg(test)] mod tests { - use std::assert_matches::assert_matches; - use std::fs; - use std::sync::atomic::Ordering; + // use std::assert_matches::assert_matches; + // use std::fs; + // use std::sync::atomic::Ordering; - use super::*; + // use super::*; - #[test] - fn test_temp_dir() -> Result<()> { - let thread = std::thread::current(); - GlobalInstance::init_testing(thread.name().unwrap()); - - let config = SpillConfig { - path: "test_data".to_string(), - reserved_disk_ratio: 0.01.into(), - global_bytes_limit: 1 << 30, - }; + // #[test] + // fn test_temp_dir() -> Result<()> { + // let thread = std::thread::current(); + // GlobalInstance::init_testing(thread.name().unwrap()); - TempDirManager::init(&config, "test_tenant")?; + // let config = SpillConfig { + // path: "test_data".to_string(), + // reserved_disk_ratio: 0.01.into(), + // global_bytes_limit: 1 << 30, + // }; - let mgr = TempDirManager::instance(); - let dir = mgr.get_disk_spill_dir(1 << 30, "some_query").unwrap(); - let path = dir.new_file_with_size(100)?.unwrap(); + // TempDirManager::init(&config, "test_tenant")?; - println!("{:?}", &path); + // let mgr = 
TempDirManager::instance(); + // let dir = mgr.get_disk_spill_dir(1 << 30, "some_query").unwrap(); + // let path = dir.new_file_with_size(100)?.unwrap(); - fs::write(&path, vec![b'a'; 100])?; + // println!("{:?}", &path); - assert_eq!(1, dir.dir_info.count.load(Ordering::Relaxed)); - assert_eq!(100, *dir.dir_info.size.lock().unwrap()); + // fs::write(&path, vec![b'a'; 100])?; - let path_str = path.as_ref().to_str().unwrap().to_string(); - drop(path); + // assert_eq!(1, dir.dir_info.count.load(Ordering::Relaxed)); + // assert_eq!(100, *dir.dir_info.size.lock().unwrap()); - assert_eq!(0, dir.dir_info.count.load(Ordering::Relaxed)); - assert_eq!(0, *dir.dir_info.size.lock().unwrap()); + // let path_str = path.as_ref().to_str().unwrap().to_string(); + // drop(path); - assert_matches!(fs::read_to_string(path_str), Err(_)); + // assert_eq!(0, dir.dir_info.count.load(Ordering::Relaxed)); + // assert_eq!(0, *dir.dir_info.size.lock().unwrap()); - mgr.drop_disk_spill_dir("some_query")?; + // assert_matches!(fs::read_to_string(path_str), Err(_)); - remove_dir_all("test_data")?; + // mgr.drop_disk_spill_dir("some_query")?; - Ok(()) - } + // remove_dir_all("test_data")?; - #[test] - fn test_drop_disk_spill_dir_unknown() -> Result<()> { - let thread = std::thread::current(); - GlobalInstance::init_testing(thread.name().unwrap()); - - let config = SpillConfig { - path: "test_data2".to_string(), - reserved_disk_ratio: 0.99.into(), - global_bytes_limit: 1 << 30, - }; + // Ok(()) + // } - TempDirManager::init(&config, "test_tenant")?; + // #[test] + // fn test_drop_disk_spill_dir_unknown() -> Result<()> { + // let thread = std::thread::current(); + // GlobalInstance::init_testing(thread.name().unwrap()); - let mgr = TempDirManager::instance(); - mgr.get_disk_spill_dir(1 << 30, "some_query").unwrap(); + // let config = SpillConfig { + // path: "test_data2".to_string(), + // reserved_disk_ratio: 0.99.into(), + // global_bytes_limit: 1 << 30, + // }; - create_dir("test_data2/test_tenant/unknown_query1")?; - create_dir("test_data2/test_tenant/unknown_query2")?; + // TempDirManager::init(&config, "test_tenant")?; - let mut deleted = mgr.drop_disk_spill_dir_unknown(10)?; + // let mgr = TempDirManager::instance(); + // mgr.get_disk_spill_dir(1 << 30, "some_query").unwrap(); - deleted.sort(); + // create_dir("test_data2/test_tenant/unknown_query1")?; + // create_dir("test_data2/test_tenant/unknown_query2")?; - assert_eq!( - vec![ - PathBuf::from("test_data2/test_tenant/unknown_query1").into_boxed_path(), - PathBuf::from("test_data2/test_tenant/unknown_query2").into_boxed_path(), - ], - deleted - ); + // let mut deleted = mgr.drop_disk_spill_dir_unknown(10)?; - remove_dir_all("test_data2")?; + // deleted.sort(); - Ok(()) - } + // assert_eq!( + // vec![ + // PathBuf::from("test_data2/test_tenant/unknown_query1").into_boxed_path(), + // PathBuf::from("test_data2/test_tenant/unknown_query2").into_boxed_path(), + // ], + // deleted + // ); + + // remove_dir_all("test_data2")?; + + // Ok(()) + // } + + #[test] + fn test() {} } From 37a2ec6590ee83a4d645f448932361f57bb002f1 Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 30 Sep 2024 21:44:10 +0800 Subject: [PATCH 35/40] x Signed-off-by: coldWater --- src/query/storages/common/cache/src/temp_dir.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs index 8b4df188dc02..2f80ea91a42a 100644 --- a/src/query/storages/common/cache/src/temp_dir.rs +++ 
b/src/query/storages/common/cache/src/temp_dir.rs @@ -73,7 +73,7 @@ impl TempDirManager { pub fn dir_xxxx(&self) -> Result<()> { let Some(path) = &self.root else { - return Err(ErrorCode::Internal(format!("emtpy root"))); + return Err(ErrorCode::Internal(format!("empty root"))); }; if let Err(e) = remove_dir_all(path) { From 36162bfd92efcc600809185f7679fde4bbfce2a8 Mon Sep 17 00:00:00 2001 From: coldWater Date: Mon, 30 Sep 2024 23:32:00 +0800 Subject: [PATCH 36/40] clean Signed-off-by: coldWater --- src/query/config/src/config.rs | 2 +- src/query/config/src/inner.rs | 2 +- .../src/pipelines/builders/builder_window.rs | 1 - .../storages/common/cache/src/temp_dir.rs | 161 ++++++++---------- 4 files changed, 77 insertions(+), 89 deletions(-) diff --git a/src/query/config/src/config.rs b/src/query/config/src/config.rs index dc65b38d9aed..c1a9dfab71bb 100644 --- a/src/query/config/src/config.rs +++ b/src/query/config/src/config.rs @@ -2942,7 +2942,7 @@ pub struct SpillConfig { #[clap( long, value_name = "VALUE", - default_value = "./.databend/_cache/temp/_query_spill" + default_value = "./.databend/temp/_query_spill" )] pub spill_local_disk_path: String, diff --git a/src/query/config/src/inner.rs b/src/query/config/src/inner.rs index bc2aec350256..9b508d9daf21 100644 --- a/src/query/config/src/inner.rs +++ b/src/query/config/src/inner.rs @@ -722,7 +722,7 @@ pub struct SpillConfig { impl Default for SpillConfig { fn default() -> Self { Self { - path: "./.databend/_cache/temp/_query_spill".to_string(), + path: "./.databend/temp/_query_spill".to_string(), reserved_disk_ratio: OrderedFloat(0.3), global_bytes_limit: u64::MAX, } diff --git a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index d94f4f9eaf94..0bddf1fb2aed 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -173,7 +173,6 @@ impl PipelineBuilder { let disk_bytes_limit = settings.get_window_partition_spilling_to_disk_bytes_limit()?; let temp_dir_manager = TempDirManager::instance(); - temp_dir_manager.dir_xxxx()?; let disk_spill = temp_dir_manager.get_disk_spill_dir(disk_bytes_limit, &self.ctx.get_id()); let window_spill_settings = WindowSpillSettings::new(&settings, num_processors)?; diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs index 2f80ea91a42a..ad23c6a06657 100644 --- a/src/query/storages/common/cache/src/temp_dir.rs +++ b/src/query/storages/common/cache/src/temp_dir.rs @@ -57,7 +57,23 @@ impl TempDirManager { .join(tenant_id) .into_boxed_path(); - (Some(path), 0) + if let Err(e) = remove_dir_all(&path) { + if !matches!(e.kind(), ErrorKind::NotFound) { + return Err(ErrorCode::StorageUnavailable(format!( + "can't clean temp dir: {e}", + ))); + } + } + + if create_dir_all(&path).is_err() { + (None, 0) + } else { + let stat = + statvfs(path.as_ref()).map_err(|e| ErrorCode::StorageOther(e.to_string()))?; + let reserved = (stat.f_blocks as f64 * *config.reserved_disk_ratio) as u64; + + (Some(path), reserved) + } }; GlobalInstance::set(Arc::new(Self { @@ -71,30 +87,6 @@ impl TempDirManager { Ok(()) } - pub fn dir_xxxx(&self) -> Result<()> { - let Some(path) = &self.root else { - return Err(ErrorCode::Internal(format!("empty root"))); - }; - - if let Err(e) = remove_dir_all(path) { - if !matches!(e.kind(), ErrorKind::NotFound) { - return Err(ErrorCode::Internal(format!( - "remove_dir_all {:?} {}", - path, e, - ))); - } 
- } - - create_dir_all(path) - .map_err(|e| ErrorCode::Internal(format!("create_dir_all {:?} {}", path, e)))?; - - let _stat = statvfs(path.as_ref()) - .map_err(|e| ErrorCode::Internal(format!("statvfs {:?} {}", path, e)))?; - // let reserved = (stat.f_blocks as f64 * *config.reserved_disk_ratio) as u64; - - Ok(()) - } - pub fn instance() -> Arc { GlobalInstance::get() } @@ -334,87 +326,84 @@ impl Drop for InnerPath { #[cfg(test)] mod tests { - // use std::assert_matches::assert_matches; - // use std::fs; - // use std::sync::atomic::Ordering; - - // use super::*; - - // #[test] - // fn test_temp_dir() -> Result<()> { - // let thread = std::thread::current(); - // GlobalInstance::init_testing(thread.name().unwrap()); - - // let config = SpillConfig { - // path: "test_data".to_string(), - // reserved_disk_ratio: 0.01.into(), - // global_bytes_limit: 1 << 30, - // }; + use std::assert_matches::assert_matches; + use std::fs; + use std::sync::atomic::Ordering; - // TempDirManager::init(&config, "test_tenant")?; + use super::*; - // let mgr = TempDirManager::instance(); - // let dir = mgr.get_disk_spill_dir(1 << 30, "some_query").unwrap(); - // let path = dir.new_file_with_size(100)?.unwrap(); + #[test] + fn test_temp_dir() -> Result<()> { + let thread = std::thread::current(); + GlobalInstance::init_testing(thread.name().unwrap()); + + let config = SpillConfig { + path: "test_data".to_string(), + reserved_disk_ratio: 0.01.into(), + global_bytes_limit: 1 << 30, + }; - // println!("{:?}", &path); + TempDirManager::init(&config, "test_tenant")?; - // fs::write(&path, vec![b'a'; 100])?; + let mgr = TempDirManager::instance(); + let dir = mgr.get_disk_spill_dir(1 << 30, "some_query").unwrap(); + let path = dir.new_file_with_size(100)?.unwrap(); - // assert_eq!(1, dir.dir_info.count.load(Ordering::Relaxed)); - // assert_eq!(100, *dir.dir_info.size.lock().unwrap()); + println!("{:?}", &path); - // let path_str = path.as_ref().to_str().unwrap().to_string(); - // drop(path); + fs::write(&path, vec![b'a'; 100])?; - // assert_eq!(0, dir.dir_info.count.load(Ordering::Relaxed)); - // assert_eq!(0, *dir.dir_info.size.lock().unwrap()); + assert_eq!(1, dir.dir_info.count.load(Ordering::Relaxed)); + assert_eq!(100, *dir.dir_info.size.lock().unwrap()); - // assert_matches!(fs::read_to_string(path_str), Err(_)); + let path_str = path.as_ref().to_str().unwrap().to_string(); + drop(path); - // mgr.drop_disk_spill_dir("some_query")?; + assert_eq!(0, dir.dir_info.count.load(Ordering::Relaxed)); + assert_eq!(0, *dir.dir_info.size.lock().unwrap()); - // remove_dir_all("test_data")?; + assert_matches!(fs::read_to_string(path_str), Err(_)); - // Ok(()) - // } + mgr.drop_disk_spill_dir("some_query")?; - // #[test] - // fn test_drop_disk_spill_dir_unknown() -> Result<()> { - // let thread = std::thread::current(); - // GlobalInstance::init_testing(thread.name().unwrap()); + remove_dir_all("test_data")?; - // let config = SpillConfig { - // path: "test_data2".to_string(), - // reserved_disk_ratio: 0.99.into(), - // global_bytes_limit: 1 << 30, - // }; + Ok(()) + } - // TempDirManager::init(&config, "test_tenant")?; + #[test] + fn test_drop_disk_spill_dir_unknown() -> Result<()> { + let thread = std::thread::current(); + GlobalInstance::init_testing(thread.name().unwrap()); + + let config = SpillConfig { + path: "test_data2".to_string(), + reserved_disk_ratio: 0.99.into(), + global_bytes_limit: 1 << 30, + }; - // let mgr = TempDirManager::instance(); - // mgr.get_disk_spill_dir(1 << 30, "some_query").unwrap(); + 
TempDirManager::init(&config, "test_tenant")?; - // create_dir("test_data2/test_tenant/unknown_query1")?; - // create_dir("test_data2/test_tenant/unknown_query2")?; + let mgr = TempDirManager::instance(); + mgr.get_disk_spill_dir(1 << 30, "some_query").unwrap(); - // let mut deleted = mgr.drop_disk_spill_dir_unknown(10)?; + create_dir("test_data2/test_tenant/unknown_query1")?; + create_dir("test_data2/test_tenant/unknown_query2")?; - // deleted.sort(); + let mut deleted = mgr.drop_disk_spill_dir_unknown(10)?; - // assert_eq!( - // vec![ - // PathBuf::from("test_data2/test_tenant/unknown_query1").into_boxed_path(), - // PathBuf::from("test_data2/test_tenant/unknown_query2").into_boxed_path(), - // ], - // deleted - // ); + deleted.sort(); - // remove_dir_all("test_data2")?; + assert_eq!( + vec![ + PathBuf::from("test_data2/test_tenant/unknown_query1").into_boxed_path(), + PathBuf::from("test_data2/test_tenant/unknown_query2").into_boxed_path(), + ], + deleted + ); - // Ok(()) - // } + remove_dir_all("test_data2")?; - #[test] - fn test() {} + Ok(()) + } } From db0135e732d06be17d51872e8dcf99a6fc33bf5f Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 8 Oct 2024 13:46:42 +0800 Subject: [PATCH 37/40] spill_local_disk_path Signed-off-by: coldWater --- src/query/config/src/config.rs | 53 ++++++++++++++----- src/query/config/src/inner.rs | 5 +- .../storages/common/cache/src/temp_dir.rs | 5 +- 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/src/query/config/src/config.rs b/src/query/config/src/config.rs index c1a9dfab71bb..8bb8c1fbe2d5 100644 --- a/src/query/config/src/config.rs +++ b/src/query/config/src/config.rs @@ -15,6 +15,7 @@ use std::collections::BTreeMap; use std::collections::HashMap; use std::env; +use std::ffi::OsString; use std::fmt; use std::fmt::Debug; use std::fmt::Formatter; @@ -2944,7 +2945,7 @@ pub struct SpillConfig { value_name = "VALUE", default_value = "./.databend/temp/_query_spill" )] - pub spill_local_disk_path: String, + pub spill_local_disk_path: OsString, #[clap(long, value_name = "VALUE", default_value = "30")] /// Percentage of reserve disk space that won't be used for spill to local disk. @@ -2956,6 +2957,8 @@ pub struct SpillConfig { } mod cache_config_converters { + use std::path::PathBuf; + use log::warn; use super::*; @@ -2988,31 +2991,55 @@ mod cache_config_converters { type Error = ErrorCode; fn try_into(self) -> Result { + let Config { + subcommand, + config_file, + query, + log, + meta, + storage, + catalog, + cache, + mut spill, + background, + catalogs: input_catalogs, + .. + } = self; + let mut catalogs = HashMap::new(); - for (k, v) in self.catalogs.into_iter() { + for (k, v) in input_catalogs.into_iter() { let catalog = v.try_into()?; catalogs.insert(k, catalog); } - if !self.catalog.address.is_empty() || !self.catalog.protocol.is_empty() { + if !catalog.address.is_empty() || !catalog.protocol.is_empty() { warn!( "`catalog` is planned to be deprecated, please add catalog in `catalogs` instead" ); - let hive = self.catalog.try_into()?; + let hive = catalog.try_into()?; let catalog = InnerCatalogConfig::Hive(hive); catalogs.insert(CATALOG_HIVE.to_string(), catalog); } + // Trick for cloud, perhaps we should introduce a new configuration for the local writeable root. 
+ if cache.disk_cache_config.path != inner::DiskCacheConfig::default().path + && spill.spill_local_disk_path == inner::SpillConfig::default().path + { + spill.spill_local_disk_path = PathBuf::from(spill.spill_local_disk_path) + .join("temp/_query_spill") + .into(); + }; + Ok(InnerConfig { - subcommand: self.subcommand, - config_file: self.config_file, - query: self.query.try_into()?, - log: self.log.try_into()?, - meta: self.meta.try_into()?, - storage: self.storage.try_into()?, + subcommand, + config_file, + query: query.try_into()?, + log: log.try_into()?, + meta: meta.try_into()?, + storage: storage.try_into()?, catalogs, - cache: self.cache.try_into()?, - spill: self.spill.try_into()?, - background: self.background.try_into()?, + cache: cache.try_into()?, + spill: spill.try_into()?, + background: background.try_into()?, }) } } diff --git a/src/query/config/src/inner.rs b/src/query/config/src/inner.rs index 9b508d9daf21..fb4a32c8afa2 100644 --- a/src/query/config/src/inner.rs +++ b/src/query/config/src/inner.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::collections::HashMap; +use std::ffi::OsString; use std::fmt; use std::fmt::Debug; use std::fmt::Display; @@ -710,7 +711,7 @@ impl Default for CacheConfig { #[derive(Clone, Debug, PartialEq, Eq)] pub struct SpillConfig { /// Path of spill to local disk. disable if it's empty. - pub path: String, + pub path: OsString, /// Ratio of the reserve of the disk space. pub reserved_disk_ratio: OrderedFloat, @@ -722,7 +723,7 @@ pub struct SpillConfig { impl Default for SpillConfig { fn default() -> Self { Self { - path: "./.databend/temp/_query_spill".to_string(), + path: OsString::from("./.databend/temp/_query_spill"), reserved_disk_ratio: OrderedFloat(0.3), global_bytes_limit: u64::MAX, } diff --git a/src/query/storages/common/cache/src/temp_dir.rs b/src/query/storages/common/cache/src/temp_dir.rs index ad23c6a06657..5493e9faf400 100644 --- a/src/query/storages/common/cache/src/temp_dir.rs +++ b/src/query/storages/common/cache/src/temp_dir.rs @@ -327,6 +327,7 @@ impl Drop for InnerPath { #[cfg(test)] mod tests { use std::assert_matches::assert_matches; + use std::ffi::OsString; use std::fs; use std::sync::atomic::Ordering; @@ -338,7 +339,7 @@ mod tests { GlobalInstance::init_testing(thread.name().unwrap()); let config = SpillConfig { - path: "test_data".to_string(), + path: OsString::from("test_data"), reserved_disk_ratio: 0.01.into(), global_bytes_limit: 1 << 30, }; @@ -377,7 +378,7 @@ mod tests { GlobalInstance::init_testing(thread.name().unwrap()); let config = SpillConfig { - path: "test_data2".to_string(), + path: OsString::from("test_data2"), reserved_disk_ratio: 0.99.into(), global_bytes_limit: 1 << 30, }; From 57d539696550ce2a2ee085309f46441864581aeb Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 8 Oct 2024 14:45:08 +0800 Subject: [PATCH 38/40] fix Signed-off-by: coldWater --- src/common/arrow/src/arrow/array/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/arrow/src/arrow/array/mod.rs b/src/common/arrow/src/arrow/array/mod.rs index 0b6aab51d50d..3f893ebaf15f 100644 --- a/src/common/arrow/src/arrow/array/mod.rs +++ b/src/common/arrow/src/arrow/array/mod.rs @@ -31,7 +31,7 @@ //! to a concrete struct based on [`PhysicalType`](crate::arrow::datatypes::PhysicalType) available from [`Array::data_type`]. //! All immutable arrays are backed by [`Buffer`](crate::arrow::buffer::Buffer) and thus cloning and slicing them is `O(1)`. //! -//! 
Most arrays contain a [`MutableArray`] counterpart that is neither clonable nor sliceable, but +//! Most arrays contain a [`MutableArray`] counterpart that is neither cloneable nor sliceable, but //! can be operated in-place. use std::any::Any; use std::sync::Arc; From 0020ee9a2e2ed88ce5e1713bde9aff46f3fd997c Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 8 Oct 2024 14:58:25 +0800 Subject: [PATCH 39/40] fix Signed-off-by: coldWater --- Cargo.lock | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c2f39de79ee8..496eb42ab184 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8522,7 +8522,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2 0.5.7", + "socket2 0.4.10", "tokio", "tower-service", "tracing", @@ -9555,7 +9555,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" dependencies = [ "cfg-if", - "windows-targets 0.52.6", + "windows-targets 0.48.5", ] [[package]] @@ -10960,7 +10960,7 @@ dependencies = [ "serde_json", "thiserror", "tokio", - "tonic 0.12.2", + "tonic 0.12.3", ] [[package]] @@ -10974,7 +10974,7 @@ dependencies = [ "opentelemetry_sdk", "prost 0.13.1", "serde", - "tonic 0.12.2", + "tonic 0.12.3", ] [[package]] @@ -12065,7 +12065,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18bec9b0adc4eba778b33684b7ba3e7137789434769ee3ce3930463ef904cfca" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools 0.10.5", "proc-macro2", "quote", "syn 2.0.58", @@ -12209,7 +12209,7 @@ dependencies = [ "indoc", "libc", "memoffset", - "parking_lot 0.12.3", + "parking_lot 0.11.2", "portable-atomic", "pyo3-build-config", "pyo3-ffi", @@ -15244,9 +15244,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.15" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "267ac89e0bec6e691e5813911606935d77c476ff49024f98abcea3e7b15e37af" +checksum = "4f4e6ce100d0eb49a2734f8c0812bcd324cf357d21810932c5df6b96ef2b86f1" dependencies = [ "futures-core", "pin-project-lite", @@ -15379,9 +15379,9 @@ dependencies = [ [[package]] name = "tonic" -version = "0.12.2" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6f6ba989e4b2c58ae83d862d3a3e27690b6e3ae630d0deb59f3697f32aa88ad" +checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" dependencies = [ "async-stream", "async-trait", @@ -15578,7 +15578,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" dependencies = [ "cfg-if", - "rand 0.8.5", + "rand 0.7.3", "static_assertions", ] From 8b9aee77349cb6306991f33ccc648910f63661a8 Mon Sep 17 00:00:00 2001 From: coldWater Date: Tue, 8 Oct 2024 19:13:47 +0800 Subject: [PATCH 40/40] fix Signed-off-by: coldWater --- src/query/config/src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/config/src/config.rs b/src/query/config/src/config.rs index 8bb8c1fbe2d5..ab1046e115c0 100644 --- a/src/query/config/src/config.rs +++ b/src/query/config/src/config.rs @@ -3024,7 +3024,7 @@ mod cache_config_converters { if cache.disk_cache_config.path != inner::DiskCacheConfig::default().path && spill.spill_local_disk_path == inner::SpillConfig::default().path { - spill.spill_local_disk_path = PathBuf::from(spill.spill_local_disk_path) + spill.spill_local_disk_path = 
PathBuf::from(&cache.disk_cache_config.path) .join("temp/_query_spill") .into(); };
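
A note on patch 36 above: it folds the old `dir_xxxx` cleanup into `TempDirManager::init` itself — wipe any leftover tenant temp directory (tolerating `NotFound`), recreate it, and derive the reserved space from `statvfs`; if the directory cannot be created, local spill is simply disabled rather than failing startup. A minimal free-standing sketch of that flow, assuming the `libc` crate for the `statvfs(3)` call and a hypothetical `prepare_temp_dir` helper (the real code threads this through `GlobalInstance` and returns `ErrorCode` instead of `io::Error`):

use std::ffi::CString;
use std::fs::{create_dir_all, remove_dir_all};
use std::io::{self, ErrorKind};
use std::os::unix::ffi::OsStrExt;
use std::path::Path;

/// Returns Ok(None) when the directory cannot be created (spill disabled),
/// otherwise Ok(Some(reserved)) computed from the filesystem stats.
fn prepare_temp_dir(path: &Path, reserved_disk_ratio: f64) -> io::Result<Option<u64>> {
    // Wipe leftovers from a previous run; a missing directory is fine,
    // any other error is fatal.
    if let Err(e) = remove_dir_all(path) {
        if e.kind() != ErrorKind::NotFound {
            return Err(e);
        }
    }

    // The patch degrades gracefully here instead of failing startup.
    if create_dir_all(path).is_err() {
        return Ok(None);
    }

    // statvfs(3) reports filesystem capacity; like the patch, this multiplies
    // the raw f_blocks count (in units of f_frsize) by the reserved ratio.
    let c_path = CString::new(path.as_os_str().as_bytes())?;
    let mut stat: libc::statvfs = unsafe { std::mem::zeroed() };
    if unsafe { libc::statvfs(c_path.as_ptr(), &mut stat) } != 0 {
        return Err(io::Error::last_os_error());
    }
    Ok(Some((stat.f_blocks as f64 * reserved_disk_ratio) as u64))
}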
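
Patches 37 and 40 interact: patch 37 widens `spill_local_disk_path` to `OsString` and introduces the cloud fallback, but derives the new path from the spill path itself; patch 40 corrects it to derive from `cache.disk_cache_config.path`. A sketch of the corrected fallback, with hypothetical constants standing in for `inner::DiskCacheConfig::default().path` and `inner::SpillConfig::default().path`:

use std::ffi::OsString;
use std::path::PathBuf;

// Hypothetical defaults for illustration only.
const DEFAULT_CACHE_PATH: &str = "./.databend/_cache";
const DEFAULT_SPILL_PATH: &str = "./.databend/temp/_query_spill";

/// When the operator customized the cache disk path but left the spill path
/// at its default, place the spill directory under the cache root.
fn resolve_spill_path(cache_path: &str, spill_path: OsString) -> OsString {
    if cache_path != DEFAULT_CACHE_PATH && spill_path == OsString::from(DEFAULT_SPILL_PATH) {
        // Patch 40's fix: start from the cache path, not the spill path.
        PathBuf::from(cache_path).join("temp/_query_spill").into()
    } else {
        spill_path
    }
}

fn main() {
    // Custom cache root with a default spill path: spill follows the cache root.
    let spill = resolve_spill_path("/data/nvme0/cache", OsString::from(DEFAULT_SPILL_PATH));
    assert_eq!(spill, OsString::from("/data/nvme0/cache/temp/_query_spill"));
}

Keeping the path as `OsString` rather than `String` also means operator-supplied paths are not assumed to be valid UTF-8, which matches the `PathBuf`-based join above.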