Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unroll the scalar loops to improve performance #2

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions NetFabric.ForEachEx.Benchmarks/ForEachBenchmarks.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@ public class ForEachBenchmarks
List<int>? list;
int[]? array;

[Params(10, 1_000)]
[Params(10, 10_000)]
public int Count { get; set; }

[GlobalSetup]
public void GlobalSetup()
{
enumerable = Utils.GetEnumerable(Count);
enumerable = Utils.GetEnumerable(Count, 100);
list = enumerable.ToList();
array = enumerable.ToArray();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ public class ForEachExEnumerableBenchmarks
{
int[]? array;

[Params(1_000)]
[Params(10_000)]
public int Count { get; set; }

[GlobalSetup]
public void GlobalSetup()
{
var enumerable = Utils.GetEnumerable(Count);
var enumerable = Utils.GetEnumerable(Count, 100);
array = enumerable.ToArray();
}

Expand Down
4 changes: 2 additions & 2 deletions NetFabric.ForEachEx.Benchmarks/ForEachVectorExBenchmarks.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ public class ForEachVectorExBenchmarks
{
int[]? array;

[Params(10, 1_000)]
[Params(10, 10_000)]
public int Count { get; set; }

[GlobalSetup]
public void GlobalSetup()
{
var enumerable = Utils.GetEnumerable(Count);
var enumerable = Utils.GetEnumerable(Count, 100);
array = enumerable.ToArray();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<TargetFramework>net7.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
Expand Down
40 changes: 38 additions & 2 deletions NetFabric.ForEachEx.Benchmarks/Program.cs
Original file line number Diff line number Diff line change
@@ -1,13 +1,49 @@
using BenchmarkDotNet.Columns;
using System.Runtime.Intrinsics;
using BenchmarkDotNet.Columns;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Diagnosers;
using BenchmarkDotNet.Environments;
using BenchmarkDotNet.Exporters;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Reports;
using BenchmarkDotNet.Running;
using Perfolizer.Horology;

// Short-running job template for .NET 7: a single warmup iteration, 0.25 s
// iterations, and at most 20 iterations.
var net70 = Job.Default
    .WithRuntime(CoreRuntime.Core70)
    .WithWarmupCount(1)
    .WithIterationTime(TimeInterval.FromSeconds(0.25))
    .WithMaxIterationCount(20);

// Same job template targeting .NET 8.
var net80 = Job.Default
    .WithRuntime(CoreRuntime.Core80)
    .WithWarmupCount(1)
    .WithIterationTime(TimeInterval.FromSeconds(0.25))
    .WithMaxIterationCount(20);

// Base configuration plus the two scalar jobs, which run with
// DOTNET_EnableHWIntrinsic=0. The .NET 7 scalar job is the baseline.
// The whole fluent chain must be one expression: the statement terminator
// belongs after the last AddJob call, otherwise the scalar jobs are never added.
var config = DefaultConfig.Instance
    .WithSummaryStyle(SummaryStyle.Default.WithRatioStyle(RatioStyle.Trend))
    .HideColumns(Column.Runtime, Column.EnvironmentVariables, Column.RatioSD, Column.Error)
    .AddDiagnoser(MemoryDiagnoser.Default)
    .AddExporter(MarkdownExporter.GitHub)
    // .AddDiagnoser(new DisassemblyDiagnoser(new DisassemblyDiagnoserConfig
    //     (exportGithubMarkdown: true, printInstructionAddresses: false)))
    .AddJob(net70.WithEnvironmentVariable("DOTNET_EnableHWIntrinsic", "0").WithId(".NET 7 Scalar").AsBaseline())
    .AddJob(net80.WithEnvironmentVariable("DOTNET_EnableHWIntrinsic", "0").WithId(".NET 8 Scalar"));

if (Vector256.IsHardwareAccelerated)
{
    // 256-bit vectors are available: run once with default settings and once
    // with DOTNET_EnableAVX2=0, the latter labelled as the Vector128 runs.
    config = config
        .AddJob(net70.WithId(".NET 7 Vector256"))
        .AddJob(net80.WithId(".NET 8 Vector256"))
        .AddJob(net70.WithEnvironmentVariable("DOTNET_EnableAVX2", "0").WithId(".NET 7 Vector128"))
        .AddJob(net80.WithEnvironmentVariable("DOTNET_EnableAVX2", "0").WithId(".NET 8 Vector128"));
}
else if (Vector128.IsHardwareAccelerated)
{
    // Only 128-bit vectors are available: the default settings are the Vector128 runs.
    config = config
        .AddJob(net70.WithId(".NET 7 Vector128"))
        .AddJob(net80.WithId(".NET 8 Vector128"));
}

// Let the command-line arguments pick which benchmarks of this assembly to run.
BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args, config);
63 changes: 63 additions & 0 deletions NetFabric.ForEachEx.Benchmarks/UnrollBenchmarks.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using BenchmarkDotNet.Attributes;

/// <summary>
/// Compares summing an <c>int[]</c> with a plain <c>foreach</c> against a
/// manually four-way unrolled loop that reads elements through a raw reference.
/// </summary>
public class UnrollBenchmarks
{
    int[]? array;

    /// <summary>Number of elements in the benchmarked array.</summary>
    [Params(10, 1_000_000)]
    public int Count { get; set; }

    /// <summary>
    /// Materializes the input array from <c>Utils.GetEnumerable(Count, 100)</c>.
    /// </summary>
    [GlobalSetup]
    public void GlobalSetup()
        => array = Utils.GetEnumerable(Count, 100).ToArray();

    /// <summary>Sums the array with a plain <c>foreach</c> loop.</summary>
    /// <returns>The sum of all elements.</returns>
    [Benchmark(Baseline = true)]
    public int Baseline()
    {
        var total = 0;
        foreach (var value in array!)
            total += value;
        return total;
    }

    /// <summary>
    /// Sums the array four elements per iteration, then adds the leftover
    /// (at most three) elements one at a time.
    /// </summary>
    /// <returns>The sum of all elements.</returns>
    [Benchmark]
    public int Unrolled()
    {
        var span = array.AsSpan();
        var length = span.Length;

        // Elements are read via Unsafe.Add on this reference instead of the span indexer.
        ref var first = ref MemoryMarshal.GetReference(span);

        // Largest multiple of 4 that does not exceed the length.
        var unrolledEnd = length - (length % 4);

        var total = 0;
        nint offset = 0;

        // Main unrolled section: four additions per iteration.
        for (; offset < unrolledEnd; offset += 4)
        {
            total += Unsafe.Add(ref first, offset);
            total += Unsafe.Add(ref first, offset + 1);
            total += Unsafe.Add(ref first, offset + 2);
            total += Unsafe.Add(ref first, offset + 3);
        }

        // Tail: whatever did not fit into a full group of four.
        for (; offset < length; offset++)
            total += Unsafe.Add(ref first, offset);

        return total;
    }
}


4 changes: 2 additions & 2 deletions NetFabric.ForEachEx.Benchmarks/Utils.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
static class Utils
{
public static IEnumerable<int> GetEnumerable(int count)
public static IEnumerable<int> GetEnumerable(int count, int maxValue)
{
var random = new Random(42);
for (var item = 0; item < count; item++)
yield return random.Next(count);
yield return random.Next(maxValue);
}
}
29 changes: 27 additions & 2 deletions NetFabric.ForEachEx/ForEachEx.ValueAction.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ public static partial class Extensions
/// This method enables custom actions to be applied to each element in a collection efficiently
/// by using a value-based action implementation, minimizing overhead.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void ForEachEx<T, TAction>(this IEnumerable<T> source, ref TAction action)
where TAction : struct, IAction<T>
{
Expand Down Expand Up @@ -98,7 +99,31 @@ public static void ForEachEx<T, TAction>(this Span<T> source, ref TAction action
public static void ForEachEx<T, TAction>(this ReadOnlySpan<T> source, ref TAction action)
where TAction : struct, IAction<T>
{
foreach (ref readonly var item in source)
action.Invoke(in item);
#if NET7_0_OR_GREATER
var index = nint.Zero;
#else
var index = (nint)0;
#endif

// use a reference to elide bounds checks
ref var sourceRef = ref MemoryMarshal.GetReference(source);

// unroll iteration for improved performance
var end = source.Length - (source.Length % 4);
while (index < end)
{
action.Invoke(in Unsafe.Add(ref sourceRef, index));
action.Invoke(in Unsafe.Add(ref sourceRef, index + 1));
action.Invoke(in Unsafe.Add(ref sourceRef, index + 2));
action.Invoke(in Unsafe.Add(ref sourceRef, index + 3));
index += 4;
}

// handle remaining elements
while (index < source.Length)
{
action.Invoke(in Unsafe.Add(ref sourceRef, index));
index++;
}
}
}
34 changes: 27 additions & 7 deletions NetFabric.ForEachEx/ForEachEx.ValueVectorAction.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ public static partial class Extensions
/// This method streamlines the process of iterating through a collection and applying a custom action to each element
/// efficiently by leveraging vectorization (SIMD) for enhanced performance on supported types and compatible hardware.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void ForEachVectorEx<T, TAction>(this IEnumerable<T> source, ref TAction action)
where T : struct
where TAction : struct, IVectorAction<T>
Expand Down Expand Up @@ -104,6 +105,12 @@ public static void ForEachVectorEx<T, TAction>(this ReadOnlySpan<T> source, ref
where T : struct
where TAction : struct, IVectorAction<T>
{
#if NET7_0_OR_GREATER
var index = nint.Zero;
#else
var index = (nint)0;
#endif

// Check if hardware acceleration is available and supported data types for SIMD operations.
if (Vector.IsHardwareAccelerated &&
#if NET7_0_OR_GREATER
Expand All @@ -119,18 +126,31 @@ public static void ForEachVectorEx<T, TAction>(this ReadOnlySpan<T> source, ref
action.Invoke(in vector);

// Calculate the remaining elements after processing vectors.
var remaining = source.Length % Vector<T>.Count;

// Reduce the source span to the remaining elements for further processing.
source = source[^remaining..];
index = source.Length - (source.Length % Vector<T>.Count);
}

// Iterate through the remaining elements (or all elements if not using SIMD operations)
// and invoke the action on each individual element.
foreach (ref readonly var item in source)

// use a reference to elide bound checks
ref var sourceRef = ref MemoryMarshal.GetReference(source);

// unroll iteration for improved performance
var end = source.Length - (source.Length % 4);
while (index < end)
{
action.Invoke(in item);
action.Invoke(in Unsafe.Add(ref sourceRef, index));
action.Invoke(in Unsafe.Add(ref sourceRef, index + 1));
action.Invoke(in Unsafe.Add(ref sourceRef, index + 2));
action.Invoke(in Unsafe.Add(ref sourceRef, index + 3));
index += 4;
}
}

// handle remaining elements
while (index < source.Length)
{
action.Invoke(in Unsafe.Add(ref sourceRef, index));
index++;
}
}
}