Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate work dispatchers into our drivers #211

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions include/neml2/dispatcher/SimpleScheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
#pragma once

#include "neml2/dispatcher/WorkScheduler.h"
#include "neml2/base/Registry.h"
#include "neml2/base/NEML2Object.h"
#include "neml2/base/Factory.h"

namespace neml2
{
Expand All @@ -40,18 +43,15 @@ namespace neml2
class SimpleScheduler : public WorkScheduler
{
public:
/// Options for the scheduler
static OptionSet expected_options();

/**
* @brief Construct a new Simple Scheduler object
* @brief Construct a new WorkScheduler object
*
* @param device Device on which to dispatch
* @param batch_size The (fixed) batch size to dispatch each time
* @param capacity The capacity of the device, i.e., the maximum number of work batches that can
* be simultaneously handled by the device at any given time. The default capacity is set to the
* maximum value of size_t
* @param options Options for the scheduler
*/
SimpleScheduler(torch::Device device,
std::size_t batch_size,
std::size_t capacity = std::numeric_limits<std::size_t>::max());
SimpleScheduler(const OptionSet & options);

bool schedule_work(torch::Device &, std::size_t &) const override;

Expand All @@ -72,4 +72,5 @@ class SimpleScheduler : public WorkScheduler
/// Current load on the device
std::size_t _load = 0;
};

} // namespace neml2
67 changes: 37 additions & 30 deletions include/neml2/dispatcher/StaticHybridScheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,38 @@ class StaticHybridScheduler : public WorkScheduler
std::size_t load;
};

/// Options for the scheduler
static OptionSet expected_options();

/// @brief Construct from options
/// @param options
StaticHybridScheduler(const OptionSet & options);

/**
* @brief Pick the next device to dispatch work to
*
* The function returns the device and the number of batches to dispatch. The device is chosen
* based on the availability of the available devices. A device is said to be available if (load +
* batch_size) <= capacity. If multiple devices are available, the device with the highest
* availability will be chosen.
*
* By default, the availability is the device's priority, a custom function can be set using
* set_availability_calculator().
*/
bool schedule_work(torch::Device &, std::size_t &) const override;

/// Set a custom availability calculator
void set_availability_calculator(std::function<double(const DeviceStatus &)>);

void dispatched_work(torch::Device, std::size_t) override;

void completed_work(torch::Device, std::size_t) override;

const std::vector<DeviceStatus> & status() const { return _devices; }

private:
/**
* The constructor takes a device list, along with the batch sizes, capacities, and priorities for
* The initializer takes a device list, along with the batch sizes, capacities, and priorities for
* each device.
*
* The device list should be unique and non-empty. torch::kCPU can appear at most once. When
Expand All @@ -86,38 +116,15 @@ class StaticHybridScheduler : public WorkScheduler
* one of a certain type. The device index is optional, and in its defaulted state represents
* (abstractly) "the current device". Further, there are two constraints on the value of the
* device index, if one is explicitly stored:
* 1. A negative index represents the current device, a non-negative index
* 0. A negative index represents the current device, a non-negative index
* represents a specific, concrete device,
* 2. When the device type is CPU, the device index must be zero.
*/
StaticHybridScheduler(const std::vector<torch::Device> & device_list,
const std::vector<std::size_t> & batch_sizes,
const std::vector<std::size_t> & capacities = {},
const std::vector<double> & priorities = {});

/**
* @brief Pick the next device to dispatch work to
*
* The function returns the device and the number of batches to dispatch. The device is chosen
* based on the availability of the available devices. A device is said to be available if (load +
* batch_size) <= capacity. If multiple devices are available, the device with the highest
* availability will be chosen.
*
* By default, the availability is the device's priority, a custom function can be set using
* set_availability_calculator().
* 1. When the device type is CPU, the device index must be zero.
*/
bool schedule_work(torch::Device &, std::size_t &) const override;

/// Set a custom availability calculator
void set_availability_calculator(std::function<double(const DeviceStatus &)>);

void dispatched_work(torch::Device, std::size_t) override;

void completed_work(torch::Device, std::size_t) override;
void setup(const std::vector<torch::Device> & device_list,
const std::vector<std::size_t> & batch_sizes,
const std::vector<std::size_t> & capacities,
const std::vector<double> & priorities);

const std::vector<DeviceStatus> & status() const { return _devices; }

private:
/// Whether a CPU device is specified
bool _cpu = false;

Expand Down
13 changes: 12 additions & 1 deletion include/neml2/dispatcher/WorkScheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#pragma once

#include "neml2/base/NEML2Object.h"
#include "neml2/misc/types.h"

namespace neml2
Expand All @@ -39,9 +40,19 @@ namespace neml2
*
* @see WorkGenerator, WorkDispatcher
*/
class WorkScheduler
class WorkScheduler : public NEML2Object
{
public:
/// Options for the scheduler
static OptionSet expected_options();

/**
* @brief Construct a new WorkScheduler object
*
* @param options Options for the scheduler
*/
WorkScheduler(const OptionSet & options);

/**
* @brief Determine the device and batch size for the next dispatch
*
Expand Down
45 changes: 45 additions & 0 deletions include/neml2/dispatcher/valuemap_helpers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// Copyright 2024, UChicago Argonne, LLC
// All Rights Reserved
// Software Name: NEML2 -- the New Engineering material Model Library, version 2
// By: Argonne National Laboratory
// OPEN SOURCE LICENSE (MIT)
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#pragma once

#include "neml2/models/map_types.h"

namespace neml2
{
/// @brief Concatenate the tensors in the ValueMap along the batch dimension
/// @param results The results to concatenate
/// @param batch_dim The batch dimension along which to concatenate
/// @return ValueMap with the tensors concatenated along the batch dimension
ValueMap valuemap_cat_reduce(std::vector<ValueMap> && results, Size batch_dim);

/// @brief Move all tensors in a ValueMap to a device
/// @param x input ValueMap
/// @param device target device
/// @return ValueMap with all tensors moved
ValueMap valuemap_move_device(ValueMap && x, torch::Device device);

/// @brief No operation
ValueMap valuemap_no_operation(ValueMap && x);
}
2 changes: 2 additions & 0 deletions include/neml2/drivers/Driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "neml2/base/Factory.h"
#include "neml2/base/DiagnosticsInterface.h"
#include "neml2/models/Model.h"
#include "neml2/dispatcher/WorkScheduler.h"
#include <filesystem>

// The following are not directly used by Solver itself.
Expand Down Expand Up @@ -62,5 +63,6 @@ class Driver : public NEML2Object, public DiagnosticsInterface
protected:
/// Whether to print out additional (debugging) information during the execution.
bool _verbose;
std::shared_ptr<WorkScheduler> _scheduler;
};
} // namespace neml2
1 change: 1 addition & 0 deletions src/neml2/base/HITParser.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ HITParser::extract_option(hit::Node * n, OptionSet & options) const
extract_option_t(bool);
extract_option_t(int);
extract_option_t(unsigned int);
extract_option_t(std::size_t);
extract_option_t(Size);
extract_option_t(Real);
extract_option_t(std::string);
Expand Down
2 changes: 1 addition & 1 deletion src/neml2/base/Parser.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@
namespace neml2
{
const std::vector<std::string> Parser::sections = {
"Tensors", "Solvers", "Data", "Models", "Drivers"};
"Tensors", "Solvers", "Data", "Models", "Drivers", "Schedulers"};
} // namespace neml2
35 changes: 29 additions & 6 deletions src/neml2/dispatcher/SimpleScheduler.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,32 @@

namespace neml2
{
SimpleScheduler::SimpleScheduler(torch::Device device, std::size_t batch_size, std::size_t capacity)
: _device(device),
_batch_size(batch_size),
_capacity(capacity)

register_NEML2_object(SimpleScheduler);

OptionSet
SimpleScheduler::expected_options()
{
OptionSet options = WorkScheduler::expected_options();
options.doc() = "Dispatch work to a single device in given batch std::size_ts.";

options.set<std::string>("device");
options.set("device").doc() = "Torch device to run on";

options.set<std::size_t>("batch_size");
options.set("batch_size").doc() = "Batch size";

options.set<std::size_t>("capacity") = std::numeric_limits<std::size_t>::max();
options.set("capacity").doc() = "Maximum number of work items that can be dispatched";

return options;
}

SimpleScheduler::SimpleScheduler(const OptionSet & options)
: WorkScheduler(options),
_device(torch::Device(options.get<std::string>("device"))),
_batch_size(options.get<std::size_t>("batch_size")),
_capacity(options.get<std::size_t>("capacity"))
{
}

Expand All @@ -46,15 +68,16 @@ SimpleScheduler::schedule_work(torch::Device & device, std::size_t & batch_size)
}

void
SimpleScheduler::dispatched_work(torch::Device, std::size_t n)
SimpleScheduler::dispatched_work(torch::Device, size_t n)
{
_load += n;
}

void
SimpleScheduler::completed_work(torch::Device, std::size_t n)
SimpleScheduler::completed_work(torch::Device, size_t n)
{
neml_assert(_load >= n, "Load underflow");
_load -= n;
}

} // namespace neml2
46 changes: 42 additions & 4 deletions src/neml2/dispatcher/StaticHybridScheduler.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,51 @@

#include "neml2/dispatcher/StaticHybridScheduler.h"
#include "neml2/misc/error.h"
#include "neml2/base/Registry.h"

namespace neml2
{
StaticHybridScheduler::StaticHybridScheduler(const std::vector<torch::Device> & device_list,
const std::vector<std::size_t> & batch_sizes,
const std::vector<std::size_t> & capacities,
const std::vector<double> & priorities)
register_NEML2_object(StaticHybridScheduler);

OptionSet
StaticHybridScheduler::expected_options()
{
OptionSet options = WorkScheduler::expected_options();
options.doc() = "Dispatch work to multiple devices based on provided batch sizes and priorities.";

options.set<std::vector<std::string>>("devices");
options.set("devices").doc() = "List of devices to dispatch work to";

options.set<std::vector<std::size_t>>("batch_sizes");
options.set("batch_sizes").doc() = "List of batch sizes for each device";

options.set<std::vector<std::size_t>>("capacities") = {};
options.set("capacities").doc() = "List of capacities for each device";

options.set<std::vector<double>>("priorities") = {};
options.set("priorities").doc() = "List of priorities for each device";

return options;
}

StaticHybridScheduler::StaticHybridScheduler(const OptionSet & options)
: WorkScheduler(options)
{
std::vector<torch::Device> devices;
for (const auto & device : options.get<std::vector<std::string>>("devices"))
devices.emplace_back(torch::Device(device));

setup(devices,
options.get<std::vector<std::size_t>>("batch_sizes"),
options.get<std::vector<std::size_t>>("capacities"),
options.get<std::vector<double>>("priorities"));
}

void
StaticHybridScheduler::setup(const std::vector<torch::Device> & device_list,
const std::vector<std::size_t> & batch_sizes,
const std::vector<std::size_t> & capacities,
const std::vector<double> & priorities)
{
// First pass:
// - Check if any CPU device is present
Expand Down
2 changes: 1 addition & 1 deletion src/neml2/dispatcher/ValueMapLoader.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,4 @@ ValueMapLoader::generate(std::size_t n)

return {m, std::move(work)};
}
} // namespace neml2
} // namespace neml2
44 changes: 44 additions & 0 deletions src/neml2/dispatcher/WorkScheduler.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright 2024, UChicago Argonne, LLC
// All Rights Reserved
// Software Name: NEML2 -- the New Engineering material Model Library, version 2
// By: Argonne National Laboratory
// OPEN SOURCE LICENSE (MIT)
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#include "neml2/dispatcher/WorkScheduler.h"

namespace neml2
{

OptionSet
WorkScheduler::expected_options()
{
OptionSet options = NEML2Object::expected_options();
options.doc() = "Schedules work to different devices.";

return options;
}

WorkScheduler::WorkScheduler(const OptionSet & options)
: NEML2Object(options)
{
}

}
Loading
Loading