[WIP] *: trace scan requests #427

Open

wants to merge 33 commits into base: master

Commits (33)
bec213a  trace scan requests (pingyu, Oct 26, 2023)
e35f479  wip (pingyu, Oct 26, 2023)
547eabb  wip (pingyu, Oct 26, 2023)
e35a71b  wip (pingyu, Oct 27, 2023)
942dbfc  wip (pingyu, Oct 27, 2023)
56e8f4d  fix blocking_write (pingyu, Oct 30, 2023)
75ecb2c  comment out block_on (pingyu, Oct 30, 2023)
83ae139  use AtomicU8 for status (pingyu, Oct 31, 2023)
34a7936  simplify (pingyu, Oct 31, 2023)
fdec181  fix check (pingyu, Oct 31, 2023)
a1afcc1  skip exchange on equal (pingyu, Oct 31, 2023)
1f9eba3  Merge branch 'status-as-atomic-u8' into trace-scan (pingyu, Oct 31, 2023)
74df9f3  fix check (pingyu, Oct 31, 2023)
63790aa  Merge remote-tracking branch 'upstream/master' into trace-scan (pingyu, Nov 1, 2023)
bab9d01  polish (pingyu, Nov 1, 2023)
7d8b777  trace tso (pingyu, Nov 2, 2023)
aabe8e5  fix get tso hang (pingyu, Nov 3, 2023)
7892d75  Merge remote-tracking branch 'upstream/master' into trace-scan (pingyu, Nov 3, 2023)
55887f4  change all log to tracing (pingyu, Nov 3, 2023)
a0bd83f  Merge remote-tracking branch 'origin/fix-get-tso-hang' into trace-scan (pingyu, Nov 3, 2023)
1d3d074  more trace for tso (pingyu, Nov 4, 2023)
7ba5f55  wake (pingyu, Nov 6, 2023)
81058ee  Merge branch 'fix-get-tso-hang' into trace-scan (pingyu, Nov 6, 2023)
e08fa53  Merge remote-tracking branch 'upstream/master' into trace-scan (pingyu, Nov 13, 2023)
faf135b  print locks (pingyu, Nov 13, 2023)
5591a9a  Merge branch 'master' into trace-scan (pingyu, Nov 22, 2023)
c2325e2  tracing for gc (pingyu, Nov 22, 2023)
e067c34  do not trace single shard (pingyu, Nov 22, 2023)
36be2d0  gc with range (pingyu, Nov 22, 2023)
20f51be  polish trace (pingyu, Nov 22, 2023)
2fb5c22  trace handle_region_error (pingyu, Nov 22, 2023)
87f5c5c  no trace ResolveLock::execute (pingyu, Nov 22, 2023)
5695e2e  migrate to tikv/minitrace-rust (andylokandy, Dec 18, 2023)
3 changes: 2 additions & 1 deletion Cargo.toml
@@ -32,7 +32,8 @@ either = "1.6"
 fail = "0.4"
 futures = { version = "0.3" }
 lazy_static = "1"
-log = "0.4"
+log = { version = "0.4", features = ["kv_unstable"] }
+minitrace = "0.6.2"
 pin-project = "1"
 prometheus = { version = "0.13", default-features = false }
 prost = "0.12"

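Note for reviewers: the minitrace spans added throughout this PR are only collected if the embedding application installs a reporter; otherwise they are silently dropped. A minimal sketch of the required setup, assuming the minitrace 0.6 API and its bundled ConsoleReporter (none of this is part of the diff):

use minitrace::collector::{Config, ConsoleReporter};
use minitrace::prelude::*;

fn main() {
    // Install a reporter once at startup; without one, all spans are discarded.
    minitrace::set_reporter(ConsoleReporter, Config::default());

    {
        // Root span for one logical operation; `set_local_parent` makes it the
        // parent of `LocalSpan`s and `#[minitrace::trace]` functions on this thread.
        let root = Span::root("scan-request", SpanContext::random());
        let _guard = root.set_local_parent();
        // ... call into the traced client here ...
    }

    // Flush any buffered spans before the process exits.
    minitrace::flush();
}
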
2 changes: 1 addition & 1 deletion src/kv/mod.rs
@@ -14,7 +14,7 @@ pub use key::Key;
 pub use kvpair::KvPair;
 pub use value::Value;
 
-struct HexRepr<'a>(pub &'a [u8]);
+pub struct HexRepr<'a>(pub &'a [u8]);
 
 impl<'a> fmt::Display for HexRepr<'a> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {

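Making HexRepr public lets code outside src/kv render raw keys as hex in log and trace output. Illustrative usage only; the import path below is hypothetical and depends on how the crate re-exports the type:

// Hypothetical import path; `HexRepr` is defined in src/kv/mod.rs.
use tikv_client::kv::HexRepr;

fn log_key(key: &[u8]) {
    // The Display impl prints the bytes as hex, handy for trace properties.
    println!("key = {}", HexRepr(key));
}
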
2 changes: 2 additions & 0 deletions src/pd/cluster.rs
@@ -103,6 +103,7 @@ impl Connection {
         Connection { security_mgr }
     }
 
+    #[minitrace::trace]
     pub async fn connect_cluster(
         &self,
         endpoints: &[String],
@@ -122,6 +123,7 @@ impl Connection {
     }
 
     // Re-establish connection with PD leader in asynchronous fashion.
+    #[minitrace::trace]
     pub async fn reconnect(&self, cluster: &mut Cluster, timeout: Duration) -> Result<()> {
         warn!("updating pd client");
         let start = Instant::now();

17 changes: 14 additions & 3 deletions src/pd/retry.rs
@@ -8,6 +8,8 @@ use std::time::Duration;
 use std::time::Instant;
 
 use async_trait::async_trait;
+use log::debug;
+use minitrace::prelude::*;
 use tokio::sync::RwLock;
 use tokio::time::sleep;
 
@@ -74,14 +76,17 @@ macro_rules! retry_core {
     ($self: ident, $tag: literal, $call: expr) => {{
         let stats = pd_stats($tag);
         let mut last_err = Ok(());
-        for _ in 0..LEADER_CHANGE_RETRY {
+        for retry in 0..LEADER_CHANGE_RETRY {
+            let _span = LocalSpan::enter_with_local_parent("RetryClient::retry");
+
             let res = $call;
 
             match stats.done(res) {
                 Ok(r) => return Ok(r),
                 Err(e) => last_err = Err(e),
             }
 
+            debug!("retry {} on last_err: {:?}", retry, last_err);
             let mut reconnect_count = MAX_REQUEST_COUNT;
             while let Err(e) = $self.reconnect(RECONNECT_INTERVAL_SEC).await {
                 reconnect_count -= 1;
@@ -142,6 +147,7 @@ impl RetryClient<Cluster> {
 impl RetryClientTrait for RetryClient<Cluster> {
     // These get_* functions will try multiple times to make a request, reconnecting as necessary.
     // It does not know about encoding. Caller should take care of it.
+    #[minitrace::trace]
     async fn get_region(self: Arc<Self>, key: Vec<u8>) -> Result<RegionWithLeader> {
         retry_mut!(self, "get_region", |cluster| {
             let key = key.clone();
@@ -156,6 +162,7 @@ impl RetryClientTrait for RetryClient<Cluster> {
         })
     }
 
+    #[minitrace::trace]
     async fn get_region_by_id(self: Arc<Self>, region_id: RegionId) -> Result<RegionWithLeader> {
         retry_mut!(self, "get_region_by_id", |cluster| async {
             cluster
@@ -167,6 +174,7 @@ impl RetryClientTrait for RetryClient<Cluster> {
         })
     }
 
+    #[minitrace::trace]
     async fn get_store(self: Arc<Self>, id: StoreId) -> Result<metapb::Store> {
         retry_mut!(self, "get_store", |cluster| async {
             cluster
@@ -176,6 +184,7 @@ impl RetryClientTrait for RetryClient<Cluster> {
         })
     }
 
+    #[minitrace::trace]
     async fn get_all_stores(self: Arc<Self>) -> Result<Vec<metapb::Store>> {
         retry_mut!(self, "get_all_stores", |cluster| async {
             cluster
@@ -185,10 +194,12 @@ impl RetryClientTrait for RetryClient<Cluster> {
         })
     }
 
+    #[minitrace::trace]
     async fn get_timestamp(self: Arc<Self>) -> Result<Timestamp> {
         retry!(self, "get_timestamp", |cluster| cluster.get_timestamp())
     }
 
+    #[minitrace::trace]
     async fn update_safepoint(self: Arc<Self>, safepoint: u64) -> Result<bool> {
         retry_mut!(self, "update_gc_safepoint", |cluster| async {
             cluster
@@ -277,7 +288,7 @@ mod test {
         }
 
         async fn retry_ok(client: Arc<MockClient>) -> Result<()> {
-            retry!(client, "test", |_c| ready(Ok::<_, Error>(())))
+            retry_mut!(client, "test", |_c| ready(Ok::<_, Error>(())))
        }
 
         executor::block_on(async {
@@ -342,7 +353,7 @@ mod test {
            client: Arc<MockClient>,
            max_retries: Arc<AtomicUsize>,
        ) -> Result<()> {
-            retry!(client, "test", |c| {
+            retry_mut!(client, "test", |c| {
                c.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
 
                let max_retries = max_retries.fetch_sub(1, Ordering::SeqCst) - 1;

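The pattern in retry_core!: `#[minitrace::trace]` opens a span for the whole get_* call, and `LocalSpan::enter_with_local_parent` nests one child span per attempt under it, so leader-change retries become visible in the trace. A standalone sketch of the same pattern (names illustrative, not from this diff):

use minitrace::prelude::*;

#[minitrace::trace]
async fn get_region(max_retries: usize) {
    for attempt in 0..max_retries {
        // One child span per attempt, parented to the `get_region` span;
        // the guard closes the span at the end of the iteration.
        let _span = LocalSpan::enter_with_local_parent("retry")
            .with_property(|| ("attempt", attempt.to_string()));
        // ... issue the request, break on success ...
    }
}
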
19 changes: 19 additions & 0 deletions src/pd/timestamp.rs
@@ -22,6 +22,7 @@ use futures::task::Context;
 use futures::task::Poll;
 use log::debug;
 use log::info;
+use minitrace::prelude::*;
 use pin_project::pin_project;
 use tokio::sync::mpsc;
 use tokio::sync::oneshot;
@@ -63,6 +64,7 @@ impl TimestampOracle {
         Ok(TimestampOracle { request_tx })
     }
 
+    #[minitrace::trace]
     pub(crate) async fn get_timestamp(self) -> Result<Timestamp> {
         debug!("getting current timestamp");
         let (request, response) = oneshot::channel();
@@ -74,6 +76,7 @@
     }
 }
 
+#[minitrace::trace]
 async fn run_tso(
     cluster_id: u64,
     mut pd_client: PdClient<Channel>,
@@ -98,6 +101,9 @@ async fn run_tso(
     let mut responses = pd_client.tso(request_stream).await?.into_inner();
 
     while let Some(Ok(resp)) = responses.next().await {
+        let _span = LocalSpan::enter_with_local_parent("handle_response");
+        debug!("got response: {:?}", resp);
+
         {
             let mut pending_requests = pending_requests.lock().await;
             allocate_timestamps(&resp, &mut pending_requests)?;
@@ -128,6 +134,7 @@ struct TsoRequestStream {
 impl Stream for TsoRequestStream {
     type Item = TsoRequest;
 
+    #[minitrace::trace]
     fn poll_next(self: Pin<&mut Self>, cx: &mut Context) -> Poll<Option<Self::Item>> {
         let mut this = self.project();
 
@@ -152,6 +159,12 @@ impl Stream for TsoRequestStream {
             }
         }
 
+        debug!(
+            "got requests: len {}, pending_requests {}",
+            requests.len(),
+            pending_requests.len()
+        );
+
         if !requests.is_empty() {
             let req = TsoRequest {
                 header: Some(RequestHeader {
@@ -168,6 +181,12 @@ impl Stream for TsoRequestStream {
             };
             pending_requests.push_back(request_group);
 
+            debug!(
+                "sending request to PD: {:?}, pending_requests {}",
+                req,
+                pending_requests.len()
+            );
+
             Poll::Ready(Some(req))
         } else {
             // Set the waker to the context, then the stream can be waked up after the pending queue

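For context on the run_tso / get_timestamp split being instrumented here: callers hand a oneshot sender to a background task over a channel, and the task resolves it when PD responds, which is why spans on both sides are useful. A simplified sketch of that request/response pattern (illustrative only, not the actual TSO types):

use tokio::sync::{mpsc, oneshot};

// Caller side: park a oneshot sender in the queue and await the reply.
async fn get_value(request_tx: &mpsc::Sender<oneshot::Sender<u64>>) -> Option<u64> {
    let (tx, rx) = oneshot::channel();
    request_tx.send(tx).await.ok()?;
    rx.await.ok()
}

// Background side: answer pending requests as responses arrive.
async fn run_server(mut request_rx: mpsc::Receiver<oneshot::Sender<u64>>) {
    let mut ts = 0u64;
    while let Some(reply) = request_rx.recv().await {
        ts += 1;
        let _ = reply.send(ts);
    }
}
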
9 changes: 9 additions & 0 deletions src/raw/requests.rs
@@ -1,6 +1,7 @@
 // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0.
 
 use std::any::Any;
+use std::fmt::Formatter;
 use std::ops::Range;
 use std::sync::Arc;
 use std::time::Duration;
@@ -404,6 +405,14 @@ impl Request for RawCoprocessorRequest {
     }
 }
 
+impl std::fmt::Debug for RawCoprocessorRequest {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("RawCoprocessorRequest")
+            .field("inner", &self.inner)
+            .finish()
+    }
+}
+
 impl KvRequest for RawCoprocessorRequest {
     type Response = kvrpcpb::RawCoprocessorResponse;
 }

2 changes: 1 addition & 1 deletion src/request/mod.rs
@@ -129,7 +129,7 @@
 
     impl HasLocks for MockRpcResponse {}
 
-    #[derive(Clone)]
+    #[derive(Debug, Clone)]
     struct MockKvRequest {
         test_invoking_count: Arc<AtomicUsize>,
     }

55 changes: 45 additions & 10 deletions src/request/plan.rs
@@ -9,6 +9,8 @@ use futures::future::try_join_all;
 use futures::prelude::*;
 use log::debug;
 use log::info;
+use minitrace::future::FutureExt;
+use minitrace::prelude::*;
 use tokio::sync::Semaphore;
 use tokio::time::sleep;
 
@@ -57,6 +59,7 @@ pub struct Dispatch<Req: KvRequest> {
 impl<Req: KvRequest> Plan for Dispatch<Req> {
     type Result = Req::Response;
 
+    #[minitrace::trace]
     async fn execute(&self) -> Result<Self::Result> {
         let stats = tikv_stats(self.request.label());
         let result = self
@@ -104,6 +107,7 @@
 {
     // A plan may involve multiple shards
     #[async_recursion]
+    #[minitrace::trace]
     async fn single_plan_handler(
         pd_client: Arc<PdC>,
         current_plan: P,
@@ -117,14 +121,17 @@
             let (shard, region_store) = shard?;
             let mut clone = current_plan.clone();
             clone.apply_shard(shard, &region_store)?;
-            let handle = tokio::spawn(Self::single_shard_handler(
-                pd_client.clone(),
-                clone,
-                region_store,
-                backoff.clone(),
-                permits.clone(),
-                preserve_region_results,
-            ));
+            let handle = tokio::spawn(
+                Self::single_shard_handler(
+                    pd_client.clone(),
+                    clone,
+                    region_store,
+                    backoff.clone(),
+                    permits.clone(),
+                    preserve_region_results,
+                )
+                .in_span(Span::enter_with_local_parent("single_shard_handler")),
+            );
             handles.push(handle);
         }
 
@@ -149,6 +156,7 @@
     }
 
     #[async_recursion]
+    #[minitrace::trace]
     async fn single_shard_handler(
         pd_client: Arc<PdC>,
         plan: P,
@@ -210,11 +218,17 @@
     // 1. Ok(true): error has been resolved, retry immediately
     // 2. Ok(false): backoff, and then retry
     // 3. Err(Error): can't be resolved, return the error to upper level
+    #[minitrace::trace]
     async fn handle_region_error(
         pd_client: Arc<PdC>,
         e: errorpb::Error,
         region_store: RegionStore,
     ) -> Result<bool> {
+        debug!(
+            "handle_region_error, error:{:?}, region_store:{:?}",
+            e, region_store
+        );
+
         let ver_id = region_store.region_with_leader.ver_id();
         if let Some(not_leader) = e.not_leader {
             if let Some(leader) = not_leader.leader {
@@ -266,6 +280,7 @@
     // 1. Ok(true): error has been resolved, retry immediately
     // 2. Ok(false): backoff, and then retry
     // 3. Err(Error): can't be resolved, return the error to upper level
+    #[minitrace::trace]
     async fn on_region_epoch_not_match(
         pd_client: Arc<PdC>,
         region_store: RegionStore,
@@ -302,6 +317,7 @@
         Ok(false)
     }
 
+    #[minitrace::trace]
     async fn handle_grpc_error(
         pd_client: Arc<PdC>,
         plan: P,
@@ -349,6 +365,7 @@
 {
     type Result = Vec<Result<P::Result>>;
 
+    #[minitrace::trace]
     async fn execute(&self) -> Result<Self::Result> {
         // Limit the maximum concurrency of multi-region request. If there are
         // too many concurrent requests, TiKV is more likely to return a "TiKV
@@ -469,6 +486,7 @@ impl<In: Clone + Send + Sync + 'static, P: Plan<Result = Vec<Result<In>>>, M: Me
 {
     type Result = M::Out;
 
+    #[minitrace::trace]
     async fn execute(&self) -> Result<Self::Result> {
         self.merge.merge(self.inner.execute().await?)
     }
@@ -565,27 +583,43 @@
 {
     type Result = P::Result;
 
+    #[minitrace::trace]
     async fn execute(&self) -> Result<Self::Result> {
         let mut result = self.inner.execute().await?;
         let mut clone = self.clone();
+        let mut retry_cnt = 0;
         loop {
+            retry_cnt += 1;
+            let _span = LocalSpan::enter_with_local_parent("ResolveLock::execute::retry")
+                .with_property(|| ("retry_count", retry_cnt.to_string()));
+
             let locks = result.take_locks();
             if locks.is_empty() {
+                debug!("ResolveLock::execute ok");
                 return Ok(result);
             }
 
             if self.backoff.is_none() {
+                debug!("ResolveLock::execute lock error");
                 return Err(Error::ResolveLockError(locks));
             }
 
             let pd_client = self.pd_client.clone();
             let live_locks = resolve_locks(locks, pd_client.clone()).await?;
             if live_locks.is_empty() {
+                debug!("ResolveLock::execute lock error retry (resolved)",);
                 result = self.inner.execute().await?;
             } else {
                 match clone.backoff.next_delay_duration() {
-                    None => return Err(Error::ResolveLockError(live_locks)),
+                    None => {
+                        debug!("ResolveLock::execute lock error");
+                        return Err(Error::ResolveLockError(live_locks));
+                    }
                     Some(delay_duration) => {
+                        debug!(
+                            "ResolveLock::execute lock error retry (delay {:?})",
+                            delay_duration
+                        );
                         sleep(delay_duration).await;
                         result = clone.inner.execute().await?;
                     }
@@ -595,7 +629,7 @@
     }
 }
 
-#[derive(Default)]
+#[derive(Default, Debug)]
 pub struct CleanupLocksResult {
     pub region_error: Option<errorpb::Error>,
     pub key_error: Option<Vec<Error>>,
@@ -667,6 +701,7 @@
 {
     type Result = CleanupLocksResult;
 
+    #[minitrace::trace]
     async fn execute(&self) -> Result<Self::Result> {
         let mut result = CleanupLocksResult::default();
         let mut inner = self.inner.clone();

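The switch to `.in_span(...)` around `tokio::spawn` is the important change in single_plan_handler: a spawned future runs on another task, so the thread-local parent span does not follow it, and the future must carry an owned `Span` instead. A minimal sketch of the pattern, assuming minitrace 0.6 and tokio (names illustrative, not from this diff):

use minitrace::future::FutureExt as _;
use minitrace::prelude::*;

async fn handle_shard(i: usize) {
    // Runs under the span attached below via `in_span`.
    let _span = LocalSpan::enter_with_local_parent("work");
    let _ = i;
}

async fn fan_out() {
    let mut handles = Vec::new();
    for i in 0..4 {
        // Without `in_span`, the spawned task would be detached from the
        // current trace; the owned Span travels with the future instead.
        let handle = tokio::spawn(
            handle_shard(i).in_span(Span::enter_with_local_parent("shard")),
        );
        handles.push(handle);
    }
    for h in handles {
        let _ = h.await;
    }
}
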