From c1ef7ccaeab6c4fbf290974c197b78de1b23d49b Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 31 Jan 2025 17:02:23 -0800 Subject: [PATCH] [xla:cpu] Delete unused timeslice parameter from parallel loop runner PiperOrigin-RevId: 721953216 --- .../cpu/runtime/parallel_loop_runner.cc | 10 +---- .../cpu/runtime/parallel_loop_runner.h | 9 +---- .../cpu/runtime/parallel_loop_runner_test.cc | 40 +++++-------------- .../cpu/runtime/xnnpack/xnn_fusion_thunk.cc | 4 +- 4 files changed, 15 insertions(+), 48 deletions(-) diff --git a/xla/backends/cpu/runtime/parallel_loop_runner.cc b/xla/backends/cpu/runtime/parallel_loop_runner.cc index 96c853641c68f..7a7176e53be05 100644 --- a/xla/backends/cpu/runtime/parallel_loop_runner.cc +++ b/xla/backends/cpu/runtime/parallel_loop_runner.cc @@ -21,13 +21,11 @@ limitations under the License. #include #include #include -#include #include #include "absl/base/attributes.h" #include "absl/base/optimization.h" #include "absl/log/check.h" -#include "absl/time/time.h" #include "xla/backends/cpu/runtime/work_queue.h" #include "xla/tsl/concurrency/async_value_ref.h" #include "xla/tsl/concurrency/chain.h" @@ -54,12 +52,8 @@ static tsl::AsyncValueRef OkDoneEventSingleton() { return singleton->AsRef(); } -ParallelLoopRunner::ParallelLoopRunner( - const Eigen::ThreadPoolDevice* device, - std::optional worker_timeslice) - : done_event_(OkDoneEventSingleton()), - device_(device), - worker_timeslice_(worker_timeslice) {} +ParallelLoopRunner::ParallelLoopRunner(const Eigen::ThreadPoolDevice* device) + : done_event_(OkDoneEventSingleton()), device_(device) {} tsl::AsyncValueRef ParallelLoopRunner::ResetDoneEvent() { auto done_event = std::move(done_event_); diff --git a/xla/backends/cpu/runtime/parallel_loop_runner.h b/xla/backends/cpu/runtime/parallel_loop_runner.h index 7d8d76d09ac3c..00aace69f8bce 100644 --- a/xla/backends/cpu/runtime/parallel_loop_runner.h +++ b/xla/backends/cpu/runtime/parallel_loop_runner.h @@ -21,7 +21,6 @@ limitations under the License. #include #include -#include "absl/time/time.h" #include "xla/tsl/concurrency/async_value_ref.h" #include "xla/tsl/concurrency/chain.h" @@ -59,9 +58,7 @@ namespace xla::cpu { // synchronized by the user. class ParallelLoopRunner { public: - explicit ParallelLoopRunner( - const Eigen::ThreadPoolDevice* device, - std::optional worker_timeslice = std::nullopt); + explicit ParallelLoopRunner(const Eigen::ThreadPoolDevice* device); // Takes ownership of the runner and returns a done event. After the done // event is transferred to the caller, it is illegal to schedule more parallel @@ -150,10 +147,6 @@ class ParallelLoopRunner { // pools for different NUMA nodes, and we have to be able to switch between // them from run to run. std::atomic device_; - - // The approximate amount of compute (in terms of wall time) that each - // persistent worker should handle. - std::optional worker_timeslice_; }; } // namespace xla::cpu diff --git a/xla/backends/cpu/runtime/parallel_loop_runner_test.cc b/xla/backends/cpu/runtime/parallel_loop_runner_test.cc index 82a9d00962614..0acbabecbfbdf 100644 --- a/xla/backends/cpu/runtime/parallel_loop_runner_test.cc +++ b/xla/backends/cpu/runtime/parallel_loop_runner_test.cc @@ -17,12 +17,10 @@ limitations under the License. #include #include -#include #include #include "absl/algorithm/container.h" #include "absl/cleanup/cleanup.h" -#include "absl/time/time.h" #include "absl/types/span.h" #include "xla/tsl/concurrency/async_value_ref.h" #include "xla/tsl/platform/env.h" @@ -36,14 +34,11 @@ limitations under the License. namespace xla::cpu { namespace { -class ParallelLoopRunnerTest - : public testing::TestWithParam> {}; - -TEST_P(ParallelLoopRunnerTest, Parallelize1D) { +TEST(ParallelLoopRunnerTest, Parallelize1D) { tsl::thread::ThreadPool threads(tsl::Env::Default(), "test", 8); Eigen::ThreadPoolDevice device(threads.AsEigenThreadPool(), threads.NumThreads()); - ParallelLoopRunner runner(&device, GetParam()); + ParallelLoopRunner runner(&device); constexpr int32_t d0 = 128; @@ -63,11 +58,11 @@ TEST_P(ParallelLoopRunnerTest, Parallelize1D) { [](int32_t value) { return value == 5; })); } -TEST_P(ParallelLoopRunnerTest, Parallelize1DTile1D) { +TEST(ParallelLoopRunnerTest, Parallelize1DTile1D) { tsl::thread::ThreadPool threads(tsl::Env::Default(), "test", 8); Eigen::ThreadPoolDevice device(threads.AsEigenThreadPool(), threads.NumThreads()); - ParallelLoopRunner runner(&device, GetParam()); + ParallelLoopRunner runner(&device); constexpr int32_t d0 = 128; @@ -91,11 +86,11 @@ TEST_P(ParallelLoopRunnerTest, Parallelize1DTile1D) { [](int32_t value) { return value == 5; })); } -TEST_P(ParallelLoopRunnerTest, Parallelize2DTile1D) { +TEST(ParallelLoopRunnerTest, Parallelize2DTile1D) { tsl::thread::ThreadPool threads(tsl::Env::Default(), "test", 8); Eigen::ThreadPoolDevice device(threads.AsEigenThreadPool(), threads.NumThreads()); - ParallelLoopRunner runner(&device, GetParam()); + ParallelLoopRunner runner(&device); constexpr int32_t d0 = 4; constexpr int32_t d1 = 39; @@ -120,11 +115,11 @@ TEST_P(ParallelLoopRunnerTest, Parallelize2DTile1D) { [](int32_t value) { return value == 5; })); } -TEST_P(ParallelLoopRunnerTest, Parallelize3DTile2D) { +TEST(ParallelLoopRunnerTest, Parallelize3DTile2D) { tsl::thread::ThreadPool threads(tsl::Env::Default(), "test", 8); Eigen::ThreadPoolDevice device(threads.AsEigenThreadPool(), threads.NumThreads()); - ParallelLoopRunner runner(&device, GetParam()); + ParallelLoopRunner runner(&device); constexpr int32_t d0 = 4; constexpr int32_t d1 = 39; @@ -153,13 +148,6 @@ TEST_P(ParallelLoopRunnerTest, Parallelize3DTile2D) { [](int32_t value) { return value == 5; })); } -INSTANTIATE_TEST_SUITE_P(ParallelLoopRunner, ParallelLoopRunnerTest, - testing::Values(std::nullopt, absl::Nanoseconds(100), - absl::Nanoseconds(500), - absl::Microseconds(1), - absl::Microseconds(10), - absl::Milliseconds(1))); - //===----------------------------------------------------------------------===// // Performance benchmarks. //===----------------------------------------------------------------------===// @@ -183,10 +171,7 @@ static void BM_Parallelize2DTile1D(benchmark::State& state) { Eigen::ThreadPoolDevice device(threads.AsEigenThreadPool(), threads.NumThreads()); - size_t timeslice = state.range(0); - ParallelLoopRunner runner( - &device, timeslice ? std::make_optional(absl::Nanoseconds(timeslice)) - : std::nullopt); + ParallelLoopRunner runner(&device); size_t range = 4; size_t tile = 1; @@ -204,10 +189,7 @@ static void BM_Parallelize3DTile2D(benchmark::State& state) { Eigen::ThreadPoolDevice device(threads.AsEigenThreadPool(), threads.NumThreads()); - size_t timeslice = state.range(0); - ParallelLoopRunner runner( - &device, timeslice ? std::make_optional(absl::Nanoseconds(timeslice)) - : std::nullopt); + ParallelLoopRunner runner(&device); size_t range = 4; size_t tile = 1; @@ -219,7 +201,7 @@ static void BM_Parallelize3DTile2D(benchmark::State& state) { } } -BENCHMARK(BM_Parallelize3DTile2D)->Arg(0)->Arg(100)->Arg(10000); +BENCHMARK(BM_Parallelize3DTile2D); } // namespace } // namespace xla::cpu diff --git a/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.cc b/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.cc index 606f85c551f09..0e42afef58cec 100644 --- a/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.cc +++ b/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.cc @@ -29,7 +29,6 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/status/statusor.h" #include "absl/strings/str_format.h" -#include "absl/time/time.h" #include "absl/types/span.h" #include "pthreadpool.h" #include "xla/backends/cpu/runtime/parallel_loop_runner.h" @@ -184,8 +183,7 @@ absl::StatusOr XnnFusionThunk::CreateXnnRuntime( // Configure XNNPACK runtime thread pool if parallelization is enabled. if (parallelization_mode == ParallelizationMode::kParallelLoopRunner) { - runtime.runner = std::make_unique( - device, /*worker_timeslice=*/absl::Microseconds(100)); + runtime.runner = std::make_unique(device); runtime.threadpool = CreateCustomPthreadpool(runtime.runner.get()); } else if (parallelization_mode == ParallelizationMode::kPThreadPool) { runtime.threadpool = DefaultPthreadpool();