diff --git a/Cargo.lock b/Cargo.lock index 5dbd6a1..a1a2266 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2922,7 +2922,7 @@ dependencies = [ [[package]] name = "squad-mortar-helper" -version = "0.4.1" +version = "0.4.2" dependencies = [ "build-time", "build_cfg", diff --git a/Cargo.toml b/Cargo.toml index 341cccf..2eabc7e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "squad-mortar-helper" -version = "0.4.1" +version = "0.4.2" edition = "2021" authors = ["William Venner "] publish = false diff --git a/util/src/image.rs b/util/src/image.rs index b0e1c6e..05f5ea9 100644 --- a/util/src/image.rs +++ b/util/src/image.rs @@ -8,7 +8,7 @@ macro_rules! par_iter_pixels { ($image:ident[$x:expr, $y:expr, $w:expr, $h:expr]) => {{ if $x + $w > $image.width() || $y + $h > $image.height() { panic!( - "iter_pixels ({}, {}) to ({}, {}) is outside of bounds for {}x{} image", + "par_iter_pixels ({}, {}) to ({}, {}) is outside of bounds for {}x{} image", $x, $y, ($x + $w) - 1, ($y + $h) - 1, $image.width(), $image.height() @@ -16,70 +16,33 @@ macro_rules! par_iter_pixels { } let self_ = $crate::UnsafeSendPtr::new_const(&$image); - ($x..($x + $w)).into_par_iter().map(move |x| ($y..($y + $h)).into_par_iter().map(move |y| (x, y))).flatten().map(move |(x_, y_)| { + ($y..($y + $h)).into_par_iter().flat_map(move |y| ($x..($x + $w)).into_par_iter().map(move |x| { let self_ = unsafe { self_.as_const() }; #[cfg(debug_assertions)] - let p = self_.get_pixel(x_, y_).clone(); + let p = self_.get_pixel(x, y).clone(); #[cfg(not(debug_assertions))] - let p = unsafe { self_.unsafe_get_pixel(x_, y_) }; + let p = unsafe { self_.unsafe_get_pixel(x, y) }; - (x_, y_, p) - }) + (x, y, p) + })) }}; ($image:ident) => {{ let (w, h) = $image.dimensions(); let self_ = $crate::UnsafeSendPtr::new_const(&$image); - (0..w).into_par_iter().map(move |x| (0..h).into_par_iter().map(move |y| (x, y))).flatten().map(move |(x_, y_)| { + (0..h).into_par_iter().flat_map(move |y| (0..w).into_par_iter().map(move |x| { let self_ = unsafe { self_.as_const() }; #[cfg(debug_assertions)] - let p = self_.get_pixel(x_, y_).clone(); + let p = self_.get_pixel(x, y).clone(); #[cfg(not(debug_assertions))] - let p = unsafe { self_.unsafe_get_pixel(x_, y_) }; + let p = unsafe { self_.unsafe_get_pixel(x, y) }; - (x_, y_, p) - }) - }}; -} - -#[macro_export] -macro_rules! iter_pixels { - ($image:ident[$x:expr, $y:expr, $w:expr, $h:expr]) => {{ - if $x + $w > $image.width() || $y + $h > $image.height() { - panic!( - "iter_pixels ({}, {}) to ({}, {}) is outside of bounds for {}x{} image", - $x, $y, ($x + $w) - 1, ($y + $h) - 1, - $image.width(), - $image.height() - ); - } - - ($x..($x + $w)).into_iter().map(move |x| ($y..($y + $h)).into_iter().map(move |y| (x, y))).flatten().map(|(x_, y_)| { - #[cfg(debug_assertions)] - let p = *$image.get_pixel(x_, y_); - - #[cfg(not(debug_assertions))] - let p = unsafe { $image.unsafe_get_pixel(x_, y_) }; - - (x_, y_, p) - }) - }}; - - ($image:ident) => {{ - let (w, h) = $image.dimensions(); - (0..w).into_iter().map(move |x| (0..h).into_iter().map(move |y| (x, y))).flatten().map(|(x_, y_)| { - #[cfg(debug_assertions)] - let p = *$image.get_pixel(x_, y_); - - #[cfg(not(debug_assertions))] - let p = unsafe { $image.unsafe_get_pixel(x_, y_) }; - - (x_, y_, p) - }) + (x, y, p) + })) }}; } diff --git a/vision-gpu/cuda/cuda.cu b/vision-gpu/cuda/cuda.cu index 7c101a8..a208376 100644 --- a/vision-gpu/cuda/cuda.cu +++ b/vision-gpu/cuda/cuda.cu @@ -458,9 +458,9 @@ edge: } } - for (int32_t xx = x - OCR_PREPROCESS_DILATE_RADIUS; xx <= x + OCR_PREPROCESS_DILATE_RADIUS; xx++) + for (int32_t yy = y - OCR_PREPROCESS_DILATE_RADIUS; yy <= y + OCR_PREPROCESS_DILATE_RADIUS; yy++) { - for (int32_t yy = y - OCR_PREPROCESS_DILATE_RADIUS; yy <= y + OCR_PREPROCESS_DILATE_RADIUS; yy++) + for (int32_t xx = x - OCR_PREPROCESS_DILATE_RADIUS; xx <= x + OCR_PREPROCESS_DILATE_RADIUS; xx++) { if (xx < 0 || xx >= w || yy < 0 || yy >= h) continue; @@ -570,9 +570,9 @@ extern "C" __global__ void filter_map_marker_icons( const uint32_t xx = template_match.xy % stride; const uint32_t yy = template_match.xy / stride; - for (uint32_t marker_x = 0; marker_x < marker_size; marker_x++) + for (uint32_t marker_y = 0; marker_y < marker_size; marker_y++) { - for (uint32_t marker_y = 0; marker_y < marker_size; marker_y++) + for (uint32_t marker_x = 0; marker_x < marker_size; marker_x++) { RGBA marker_pixel = marker[marker_y * marker_size + marker_x]; RGB pixel = input[(yy + marker_y) * stride + (xx + marker_x)]; diff --git a/vision-gpu/cuda/cuda_release.ptx b/vision-gpu/cuda/cuda_release.ptx index 8c365db..7dcdaf7 100644 --- a/vision-gpu/cuda/cuda_release.ptx +++ b/vision-gpu/cuda/cuda_release.ptx @@ -442,27 +442,27 @@ $L__BB4_3: or.pred %p15, %p13, %p14; @%p15 bra $L__BB4_15; - add.s32 %r105, %r1, -3; - add.s32 %r42, %r1, 3; + add.s32 %r105, %r2, -3; + add.s32 %r42, %r2, 3; setp.gt.u32 %p16, %r105, %r42; @%p16 bra $L__BB4_15; $L__BB4_7: - add.s32 %r106, %r2, -3; - add.s32 %r48, %r2, 3; + add.s32 %r106, %r1, -3; + add.s32 %r48, %r1, 3; setp.gt.u32 %p17, %r106, %r48; @%p17 bra $L__BB4_14; $L__BB4_9: or.b32 %r53, %r106, %r105; setp.lt.s32 %p18, %r53, 0; - setp.ge.u32 %p19, %r105, %r9; + setp.ge.u32 %p19, %r106, %r9; or.pred %p20, %p19, %p18; - setp.ge.u32 %p21, %r106, %r10; + setp.ge.u32 %p21, %r105, %r10; or.pred %p22, %p21, %p20; @%p22 bra $L__BB4_13; - mad.lo.s32 %r54, %r106, %r9, %r105; + mad.lo.s32 %r54, %r105, %r9, %r106; mul.wide.u32 %rd7, %r54, 3; add.s64 %rd8, %rd3, %rd7; ld.global.u8 %rs5, [%rd8]; @@ -836,186 +836,324 @@ $L__BB6_13: .param .u32 filter_map_marker_icons_param_6 ) { - .reg .pred %p<9>; - .reg .b16 %rs<17>; - .reg .f32 %f<7>; - .reg .b32 %r<98>; - .reg .f64 %fd<13>; - .reg .b64 %rd<25>; - - - ld.param.u64 %rd6, [filter_map_marker_icons_param_0]; - ld.param.u32 %r27, [filter_map_marker_icons_param_1]; - ld.param.u64 %rd4, [filter_map_marker_icons_param_2]; - ld.param.u64 %rd5, [filter_map_marker_icons_param_3]; - ld.param.u32 %r28, [filter_map_marker_icons_param_4]; - ld.param.u32 %r29, [filter_map_marker_icons_param_5]; - ld.param.u32 %r30, [filter_map_marker_icons_param_6]; - cvta.to.global.u64 %rd1, %rd6; - mov.u32 %r31, %ntid.x; - mov.u32 %r32, %ctaid.x; - mov.u32 %r33, %tid.x; - mad.lo.s32 %r1, %r32, %r31, %r33; - mov.u32 %r34, %ntid.y; - mov.u32 %r35, %ctaid.y; - mov.u32 %r36, %tid.y; - mad.lo.s32 %r2, %r35, %r34, %r36; - setp.ge.u32 %p1, %r1, %r29; - setp.ge.u32 %p2, %r2, %r30; + .reg .pred %p<11>; + .reg .b16 %rs<33>; + .reg .f32 %f<15>; + .reg .b32 %r<167>; + .reg .f64 %fd<29>; + .reg .b64 %rd<42>; + + + ld.param.u64 %rd7, [filter_map_marker_icons_param_0]; + ld.param.u32 %r26, [filter_map_marker_icons_param_1]; + ld.param.u64 %rd5, [filter_map_marker_icons_param_2]; + ld.param.u64 %rd6, [filter_map_marker_icons_param_3]; + ld.param.u32 %r27, [filter_map_marker_icons_param_4]; + ld.param.u32 %r28, [filter_map_marker_icons_param_5]; + ld.param.u32 %r29, [filter_map_marker_icons_param_6]; + cvta.to.global.u64 %rd1, %rd7; + mov.u32 %r30, %ntid.x; + mov.u32 %r31, %ctaid.x; + mov.u32 %r32, %tid.x; + mad.lo.s32 %r1, %r31, %r30, %r32; + mov.u32 %r33, %ntid.y; + mov.u32 %r34, %ctaid.y; + mov.u32 %r35, %tid.y; + mad.lo.s32 %r2, %r34, %r33, %r35; + setp.ge.u32 %p1, %r1, %r28; + setp.ge.u32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; - @%p3 bra $L__BB7_9; - - cvta.to.global.u64 %rd7, %rd5; - mul.wide.u32 %rd8, %r1, 8; - add.s64 %rd9, %rd7, %rd8; - ld.global.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd2, %rd10; - cvta.to.global.u64 %rd11, %rd4; - mul.wide.u32 %rd12, %r2, 8; - add.s64 %rd3, %rd11, %rd12; + @%p3 bra $L__BB7_11; + + cvta.to.global.u64 %rd8, %rd6; + mul.wide.u32 %rd9, %r1, 8; + add.s64 %rd10, %rd8, %rd9; + ld.global.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd2, %rd11; + cvta.to.global.u64 %rd12, %rd5; + mul.wide.u32 %rd13, %r2, 8; + add.s64 %rd3, %rd12, %rd13; ld.global.u32 %r3, [%rd3]; - setp.eq.s32 %p4, %r28, 0; - @%p4 bra $L__BB7_9; - - and.b32 %r4, %r28, 1; - shl.b32 %r5, %r28, 1; - add.s32 %r6, %r3, %r27; - shl.b32 %r7, %r27, 1; - sub.s32 %r8, %r4, %r28; - div.u32 %r9, %r3, %r27; - mul.lo.s32 %r38, %r9, %r27; - sub.s32 %r10, %r3, %r38; - mov.u32 %r37, 0; - setp.eq.s32 %p5, %r28, 1; - setp.eq.s32 %p7, %r4, 0; - mov.u32 %r91, %r37; + setp.eq.s32 %p4, %r27, 0; + @%p4 bra $L__BB7_11; + + add.s32 %r4, %r27, -1; + add.s32 %r5, %r3, 3; + and.b32 %r6, %r27, 3; + sub.s32 %r7, %r6, %r27; + div.u32 %r8, %r3, %r26; + mul.lo.s32 %r37, %r8, %r26; + sub.s32 %r9, %r3, %r37; + mov.u32 %r36, 0; + setp.lt.u32 %p5, %r4, 3; + setp.eq.s32 %p7, %r6, 0; + setp.eq.s32 %p8, %r6, 1; + setp.eq.s32 %p9, %r6, 2; + mov.u32 %r162, %r36; $L__BB7_3: - mov.u32 %r97, %r37; + mul.lo.s32 %r11, %r162, %r27; + add.s32 %r39, %r162, %r8; + mad.lo.s32 %r12, %r39, %r26, %r9; + mov.u32 %r166, %r36; @%p5 bra $L__BB7_6; - ld.global.u16 %rs16, [%rd3+4]; - add.s32 %r95, %r28, %r91; - add.s32 %r93, %r6, %r91; - add.s32 %r92, %r3, %r91; - mov.u32 %r94, %r91; - mov.u32 %r97, %r37; + ld.global.u16 %rs32, [%rd3+4]; + mul.lo.s32 %r41, %r26, %r162; + add.s32 %r164, %r5, %r41; + add.s32 %r163, %r11, 3; + add.s32 %r15, %r3, %r41; + add.s32 %r16, %r15, 1; + add.s32 %r17, %r11, 1; + mov.u32 %r166, %r36; $L__BB7_5: - mul.wide.u32 %rd13, %r94, 4; - add.s64 %rd14, %rd2, %rd13; - mul.wide.u32 %rd15, %r92, 3; - add.s64 %rd16, %rd1, %rd15; - ld.global.u8 %r41, [%rd16]; - ld.global.u8 %r42, [%rd14]; - sub.s32 %r43, %r41, %r42; - abs.s32 %r44, %r43; - ld.global.u8 %r45, [%rd16+1]; - ld.global.u8 %r46, [%rd14+1]; - sub.s32 %r47, %r45, %r46; - abs.s32 %r48, %r47; - add.s32 %r49, %r48, %r44; - ld.global.u8 %r50, [%rd16+2]; - ld.global.u8 %r51, [%rd14+2]; - sub.s32 %r52, %r50, %r51; - abs.s32 %r53, %r52; - add.s32 %r54, %r49, %r53; - cvt.u16.u32 %rs4, %r54; - cvt.rn.f32.u16 %f1, %rs4; + add.s32 %r42, %r11, %r166; + mul.wide.u32 %rd14, %r42, 4; + add.s64 %rd15, %rd2, %rd14; + add.s32 %r43, %r15, %r166; + mul.wide.u32 %rd16, %r43, 3; + add.s64 %rd17, %rd1, %rd16; + ld.global.u8 %r44, [%rd17]; + ld.global.u8 %r45, [%rd15]; + sub.s32 %r46, %r44, %r45; + abs.s32 %r47, %r46; + ld.global.u8 %r48, [%rd17+1]; + ld.global.u8 %r49, [%rd15+1]; + sub.s32 %r50, %r48, %r49; + abs.s32 %r51, %r50; + add.s32 %r52, %r51, %r47; + ld.global.u8 %r53, [%rd17+2]; + ld.global.u8 %r54, [%rd15+2]; + sub.s32 %r55, %r53, %r54; + abs.s32 %r56, %r55; + add.s32 %r57, %r52, %r56; + cvt.u16.u32 %rs6, %r57; + cvt.rn.f32.u16 %f1, %rs6; cvt.f64.f32 %fd1, %f1; - ld.global.u8 %rs5, [%rd14+3]; - cvt.rn.f32.u16 %f2, %rs5; + ld.global.u8 %rs7, [%rd15+3]; + cvt.rn.f32.u16 %f2, %rs7; cvt.f64.f32 %fd2, %f2; div.rn.f64 %fd3, %fd2, 0d406FE00000000000; mul.f64 %fd4, %fd3, %fd1; - cvt.rzi.u32.f64 %r55, %fd4; - cvt.u16.u32 %rs6, %r55; - add.s16 %rs7, %rs16, %rs6; - st.global.u16 [%rd3+4], %rs7; - mul.wide.u32 %rd17, %r95, 4; - add.s64 %rd18, %rd2, %rd17; - mul.wide.u32 %rd19, %r93, 3; - add.s64 %rd20, %rd1, %rd19; - ld.global.u8 %r56, [%rd20]; - ld.global.u8 %r57, [%rd18]; - sub.s32 %r58, %r56, %r57; - abs.s32 %r59, %r58; - ld.global.u8 %r60, [%rd20+1]; - ld.global.u8 %r61, [%rd18+1]; - sub.s32 %r62, %r60, %r61; - abs.s32 %r63, %r62; - add.s32 %r64, %r63, %r59; - ld.global.u8 %r65, [%rd20+2]; - ld.global.u8 %r66, [%rd18+2]; + cvt.rzi.u32.f64 %r58, %fd4; + cvt.u16.u32 %rs8, %r58; + add.s16 %rs9, %rs32, %rs8; + st.global.u16 [%rd3+4], %rs9; + add.s32 %r59, %r17, %r166; + mul.wide.u32 %rd18, %r59, 4; + add.s64 %rd19, %rd2, %rd18; + add.s32 %r60, %r16, %r166; + mul.wide.u32 %rd20, %r60, 3; + add.s64 %rd21, %rd1, %rd20; + ld.global.u8 %r61, [%rd21]; + ld.global.u8 %r62, [%rd19]; + sub.s32 %r63, %r61, %r62; + abs.s32 %r64, %r63; + ld.global.u8 %r65, [%rd21+1]; + ld.global.u8 %r66, [%rd19+1]; sub.s32 %r67, %r65, %r66; abs.s32 %r68, %r67; - add.s32 %r69, %r64, %r68; - cvt.u16.u32 %rs8, %r69; - cvt.rn.f32.u16 %f3, %rs8; + add.s32 %r69, %r68, %r64; + ld.global.u8 %r70, [%rd21+2]; + ld.global.u8 %r71, [%rd19+2]; + sub.s32 %r72, %r70, %r71; + abs.s32 %r73, %r72; + add.s32 %r74, %r69, %r73; + cvt.u16.u32 %rs10, %r74; + cvt.rn.f32.u16 %f3, %rs10; cvt.f64.f32 %fd5, %f3; - ld.global.u8 %rs9, [%rd18+3]; - cvt.rn.f32.u16 %f4, %rs9; + ld.global.u8 %rs11, [%rd19+3]; + cvt.rn.f32.u16 %f4, %rs11; cvt.f64.f32 %fd6, %f4; div.rn.f64 %fd7, %fd6, 0d406FE00000000000; mul.f64 %fd8, %fd7, %fd5; - cvt.rzi.u32.f64 %r70, %fd8; - cvt.u16.u32 %rs10, %r70; - add.s16 %rs16, %rs7, %rs10; - st.global.u16 [%rd3+4], %rs16; - add.s32 %r95, %r95, %r5; - add.s32 %r94, %r94, %r5; - add.s32 %r93, %r93, %r7; - add.s32 %r92, %r92, %r7; - add.s32 %r97, %r97, 2; - add.s32 %r71, %r8, %r97; - setp.ne.s32 %p6, %r71, 0; - @%p6 bra $L__BB7_5; - -$L__BB7_6: - @%p7 bra $L__BB7_8; - - mad.lo.s32 %r72, %r97, %r28, %r91; - mul.wide.u32 %rd21, %r72, 4; - add.s64 %rd22, %rd2, %rd21; - add.s32 %r73, %r97, %r9; - add.s32 %r74, %r91, %r10; - mad.lo.s32 %r75, %r73, %r27, %r74; - mul.wide.u32 %rd23, %r75, 3; - add.s64 %rd24, %rd1, %rd23; - ld.global.u8 %r76, [%rd24]; - ld.global.u8 %r77, [%rd22]; - sub.s32 %r78, %r76, %r77; - abs.s32 %r79, %r78; - ld.global.u8 %r80, [%rd24+1]; - ld.global.u8 %r81, [%rd22+1]; - sub.s32 %r82, %r80, %r81; - abs.s32 %r83, %r82; - add.s32 %r84, %r83, %r79; - ld.global.u8 %r85, [%rd24+2]; - ld.global.u8 %r86, [%rd22+2]; - sub.s32 %r87, %r85, %r86; - abs.s32 %r88, %r87; - add.s32 %r89, %r84, %r88; - cvt.u16.u32 %rs11, %r89; - cvt.rn.f32.u16 %f5, %rs11; + cvt.rzi.u32.f64 %r75, %fd8; + cvt.u16.u32 %rs12, %r75; + add.s16 %rs13, %rs9, %rs12; + st.global.u16 [%rd3+4], %rs13; + add.s32 %r76, %r163, -1; + mul.wide.u32 %rd22, %r76, 4; + add.s64 %rd23, %rd2, %rd22; + add.s32 %r77, %r164, -1; + mul.wide.u32 %rd24, %r77, 3; + add.s64 %rd25, %rd1, %rd24; + ld.global.u8 %r78, [%rd25]; + ld.global.u8 %r79, [%rd23]; + sub.s32 %r80, %r78, %r79; + abs.s32 %r81, %r80; + ld.global.u8 %r82, [%rd25+1]; + ld.global.u8 %r83, [%rd23+1]; + sub.s32 %r84, %r82, %r83; + abs.s32 %r85, %r84; + add.s32 %r86, %r85, %r81; + ld.global.u8 %r87, [%rd25+2]; + ld.global.u8 %r88, [%rd23+2]; + sub.s32 %r89, %r87, %r88; + abs.s32 %r90, %r89; + add.s32 %r91, %r86, %r90; + cvt.u16.u32 %rs14, %r91; + cvt.rn.f32.u16 %f5, %rs14; cvt.f64.f32 %fd9, %f5; - ld.global.u8 %rs12, [%rd22+3]; - cvt.rn.f32.u16 %f6, %rs12; + ld.global.u8 %rs15, [%rd23+3]; + cvt.rn.f32.u16 %f6, %rs15; cvt.f64.f32 %fd10, %f6; div.rn.f64 %fd11, %fd10, 0d406FE00000000000; mul.f64 %fd12, %fd11, %fd9; - cvt.rzi.u32.f64 %r90, %fd12; - cvt.u16.u32 %rs13, %r90; - ld.global.u16 %rs14, [%rd3+4]; - add.s16 %rs15, %rs14, %rs13; - st.global.u16 [%rd3+4], %rs15; - -$L__BB7_8: - add.s32 %r91, %r91, 1; - setp.lt.u32 %p8, %r91, %r28; - @%p8 bra $L__BB7_3; - -$L__BB7_9: + cvt.rzi.u32.f64 %r92, %fd12; + cvt.u16.u32 %rs16, %r92; + add.s16 %rs17, %rs13, %rs16; + st.global.u16 [%rd3+4], %rs17; + mul.wide.u32 %rd26, %r163, 4; + add.s64 %rd27, %rd2, %rd26; + mul.wide.u32 %rd28, %r164, 3; + add.s64 %rd29, %rd1, %rd28; + ld.global.u8 %r93, [%rd29]; + ld.global.u8 %r94, [%rd27]; + sub.s32 %r95, %r93, %r94; + abs.s32 %r96, %r95; + ld.global.u8 %r97, [%rd29+1]; + ld.global.u8 %r98, [%rd27+1]; + sub.s32 %r99, %r97, %r98; + abs.s32 %r100, %r99; + add.s32 %r101, %r100, %r96; + ld.global.u8 %r102, [%rd29+2]; + ld.global.u8 %r103, [%rd27+2]; + sub.s32 %r104, %r102, %r103; + abs.s32 %r105, %r104; + add.s32 %r106, %r101, %r105; + cvt.u16.u32 %rs18, %r106; + cvt.rn.f32.u16 %f7, %rs18; + cvt.f64.f32 %fd13, %f7; + ld.global.u8 %rs19, [%rd27+3]; + cvt.rn.f32.u16 %f8, %rs19; + cvt.f64.f32 %fd14, %f8; + div.rn.f64 %fd15, %fd14, 0d406FE00000000000; + mul.f64 %fd16, %fd15, %fd13; + cvt.rzi.u32.f64 %r107, %fd16; + cvt.u16.u32 %rs20, %r107; + add.s16 %rs32, %rs17, %rs20; + st.global.u16 [%rd3+4], %rs32; + add.s32 %r164, %r164, 4; + add.s32 %r163, %r163, 4; + add.s32 %r166, %r166, 4; + add.s32 %r108, %r7, %r166; + setp.ne.s32 %p6, %r108, 0; + @%p6 bra $L__BB7_5; + +$L__BB7_6: + @%p7 bra $L__BB7_10; + + add.s32 %r109, %r166, %r11; + mul.wide.u32 %rd30, %r109, 4; + add.s64 %rd31, %rd2, %rd30; + add.s32 %r110, %r12, %r166; + mul.wide.u32 %rd32, %r110, 3; + add.s64 %rd33, %rd1, %rd32; + ld.global.u8 %r111, [%rd33]; + ld.global.u8 %r112, [%rd31]; + sub.s32 %r113, %r111, %r112; + abs.s32 %r114, %r113; + ld.global.u8 %r115, [%rd33+1]; + ld.global.u8 %r116, [%rd31+1]; + sub.s32 %r117, %r115, %r116; + abs.s32 %r118, %r117; + add.s32 %r119, %r118, %r114; + ld.global.u8 %r120, [%rd33+2]; + ld.global.u8 %r121, [%rd31+2]; + sub.s32 %r122, %r120, %r121; + abs.s32 %r123, %r122; + add.s32 %r124, %r119, %r123; + cvt.u16.u32 %rs21, %r124; + cvt.rn.f32.u16 %f9, %rs21; + cvt.f64.f32 %fd17, %f9; + ld.global.u8 %rs22, [%rd31+3]; + cvt.rn.f32.u16 %f10, %rs22; + cvt.f64.f32 %fd18, %f10; + div.rn.f64 %fd19, %fd18, 0d406FE00000000000; + mul.f64 %fd20, %fd19, %fd17; + cvt.rzi.u32.f64 %r125, %fd20; + cvt.u16.u32 %rs23, %r125; + ld.global.u16 %rs24, [%rd3+4]; + add.s16 %rs4, %rs24, %rs23; + st.global.u16 [%rd3+4], %rs4; + @%p8 bra $L__BB7_10; + + add.s32 %r126, %r166, 1; + add.s32 %r127, %r126, %r11; + mul.wide.u32 %rd34, %r127, 4; + add.s64 %rd35, %rd2, %rd34; + add.s32 %r128, %r12, %r126; + mul.wide.u32 %rd36, %r128, 3; + add.s64 %rd37, %rd1, %rd36; + ld.global.u8 %r129, [%rd37]; + ld.global.u8 %r130, [%rd35]; + sub.s32 %r131, %r129, %r130; + abs.s32 %r132, %r131; + ld.global.u8 %r133, [%rd37+1]; + ld.global.u8 %r134, [%rd35+1]; + sub.s32 %r135, %r133, %r134; + abs.s32 %r136, %r135; + add.s32 %r137, %r136, %r132; + ld.global.u8 %r138, [%rd37+2]; + ld.global.u8 %r139, [%rd35+2]; + sub.s32 %r140, %r138, %r139; + abs.s32 %r141, %r140; + add.s32 %r142, %r137, %r141; + cvt.u16.u32 %rs25, %r142; + cvt.rn.f32.u16 %f11, %rs25; + cvt.f64.f32 %fd21, %f11; + ld.global.u8 %rs26, [%rd35+3]; + cvt.rn.f32.u16 %f12, %rs26; + cvt.f64.f32 %fd22, %f12; + div.rn.f64 %fd23, %fd22, 0d406FE00000000000; + mul.f64 %fd24, %fd23, %fd21; + cvt.rzi.u32.f64 %r143, %fd24; + cvt.u16.u32 %rs27, %r143; + add.s16 %rs5, %rs4, %rs27; + st.global.u16 [%rd3+4], %rs5; + @%p9 bra $L__BB7_10; + + add.s32 %r144, %r166, 2; + add.s32 %r145, %r144, %r11; + mul.wide.u32 %rd38, %r145, 4; + add.s64 %rd39, %rd2, %rd38; + add.s32 %r146, %r12, %r144; + mul.wide.u32 %rd40, %r146, 3; + add.s64 %rd41, %rd1, %rd40; + ld.global.u8 %r147, [%rd41]; + ld.global.u8 %r148, [%rd39]; + sub.s32 %r149, %r147, %r148; + abs.s32 %r150, %r149; + ld.global.u8 %r151, [%rd41+1]; + ld.global.u8 %r152, [%rd39+1]; + sub.s32 %r153, %r151, %r152; + abs.s32 %r154, %r153; + add.s32 %r155, %r154, %r150; + ld.global.u8 %r156, [%rd41+2]; + ld.global.u8 %r157, [%rd39+2]; + sub.s32 %r158, %r156, %r157; + abs.s32 %r159, %r158; + add.s32 %r160, %r155, %r159; + cvt.u16.u32 %rs28, %r160; + cvt.rn.f32.u16 %f13, %rs28; + cvt.f64.f32 %fd25, %f13; + ld.global.u8 %rs29, [%rd39+3]; + cvt.rn.f32.u16 %f14, %rs29; + cvt.f64.f32 %fd26, %f14; + div.rn.f64 %fd27, %fd26, 0d406FE00000000000; + mul.f64 %fd28, %fd27, %fd25; + cvt.rzi.u32.f64 %r161, %fd28; + cvt.u16.u32 %rs30, %r161; + add.s16 %rs31, %rs5, %rs30; + st.global.u16 [%rd3+4], %rs31; + +$L__BB7_10: + add.s32 %r162, %r162, 1; + setp.lt.u32 %p10, %r162, %r27; + @%p10 bra $L__BB7_3; + +$L__BB7_11: ret; } diff --git a/vision-gpu/cuda/cuda_release_35.ptx b/vision-gpu/cuda/cuda_release_35.ptx index 33fa3e5..38a3f12 100644 --- a/vision-gpu/cuda/cuda_release_35.ptx +++ b/vision-gpu/cuda/cuda_release_35.ptx @@ -442,27 +442,27 @@ $L__BB4_3: or.pred %p15, %p13, %p14; @%p15 bra $L__BB4_15; - add.s32 %r105, %r1, -3; - add.s32 %r42, %r1, 3; + add.s32 %r105, %r2, -3; + add.s32 %r42, %r2, 3; setp.gt.u32 %p16, %r105, %r42; @%p16 bra $L__BB4_15; $L__BB4_7: - add.s32 %r106, %r2, -3; - add.s32 %r48, %r2, 3; + add.s32 %r106, %r1, -3; + add.s32 %r48, %r1, 3; setp.gt.u32 %p17, %r106, %r48; @%p17 bra $L__BB4_14; $L__BB4_9: or.b32 %r53, %r106, %r105; setp.lt.s32 %p18, %r53, 0; - setp.ge.u32 %p19, %r105, %r9; + setp.ge.u32 %p19, %r106, %r9; or.pred %p20, %p19, %p18; - setp.ge.u32 %p21, %r106, %r10; + setp.ge.u32 %p21, %r105, %r10; or.pred %p22, %p21, %p20; @%p22 bra $L__BB4_13; - mad.lo.s32 %r54, %r106, %r9, %r105; + mad.lo.s32 %r54, %r105, %r9, %r106; mul.wide.u32 %rd7, %r54, 3; add.s64 %rd8, %rd3, %rd7; ld.global.u8 %rs5, [%rd8]; @@ -836,186 +836,324 @@ $L__BB6_13: .param .u32 filter_map_marker_icons_param_6 ) { - .reg .pred %p<9>; - .reg .b16 %rs<17>; - .reg .f32 %f<7>; - .reg .b32 %r<98>; - .reg .f64 %fd<13>; - .reg .b64 %rd<25>; - - - ld.param.u64 %rd6, [filter_map_marker_icons_param_0]; - ld.param.u32 %r27, [filter_map_marker_icons_param_1]; - ld.param.u64 %rd4, [filter_map_marker_icons_param_2]; - ld.param.u64 %rd5, [filter_map_marker_icons_param_3]; - ld.param.u32 %r28, [filter_map_marker_icons_param_4]; - ld.param.u32 %r29, [filter_map_marker_icons_param_5]; - ld.param.u32 %r30, [filter_map_marker_icons_param_6]; - cvta.to.global.u64 %rd1, %rd6; - mov.u32 %r31, %ntid.x; - mov.u32 %r32, %ctaid.x; - mov.u32 %r33, %tid.x; - mad.lo.s32 %r1, %r32, %r31, %r33; - mov.u32 %r34, %ntid.y; - mov.u32 %r35, %ctaid.y; - mov.u32 %r36, %tid.y; - mad.lo.s32 %r2, %r35, %r34, %r36; - setp.ge.u32 %p1, %r1, %r29; - setp.ge.u32 %p2, %r2, %r30; + .reg .pred %p<11>; + .reg .b16 %rs<33>; + .reg .f32 %f<15>; + .reg .b32 %r<167>; + .reg .f64 %fd<29>; + .reg .b64 %rd<42>; + + + ld.param.u64 %rd7, [filter_map_marker_icons_param_0]; + ld.param.u32 %r26, [filter_map_marker_icons_param_1]; + ld.param.u64 %rd5, [filter_map_marker_icons_param_2]; + ld.param.u64 %rd6, [filter_map_marker_icons_param_3]; + ld.param.u32 %r27, [filter_map_marker_icons_param_4]; + ld.param.u32 %r28, [filter_map_marker_icons_param_5]; + ld.param.u32 %r29, [filter_map_marker_icons_param_6]; + cvta.to.global.u64 %rd1, %rd7; + mov.u32 %r30, %ntid.x; + mov.u32 %r31, %ctaid.x; + mov.u32 %r32, %tid.x; + mad.lo.s32 %r1, %r31, %r30, %r32; + mov.u32 %r33, %ntid.y; + mov.u32 %r34, %ctaid.y; + mov.u32 %r35, %tid.y; + mad.lo.s32 %r2, %r34, %r33, %r35; + setp.ge.u32 %p1, %r1, %r28; + setp.ge.u32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; - @%p3 bra $L__BB7_9; - - cvta.to.global.u64 %rd7, %rd5; - mul.wide.u32 %rd8, %r1, 8; - add.s64 %rd9, %rd7, %rd8; - ld.global.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd2, %rd10; - cvta.to.global.u64 %rd11, %rd4; - mul.wide.u32 %rd12, %r2, 8; - add.s64 %rd3, %rd11, %rd12; + @%p3 bra $L__BB7_11; + + cvta.to.global.u64 %rd8, %rd6; + mul.wide.u32 %rd9, %r1, 8; + add.s64 %rd10, %rd8, %rd9; + ld.global.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd2, %rd11; + cvta.to.global.u64 %rd12, %rd5; + mul.wide.u32 %rd13, %r2, 8; + add.s64 %rd3, %rd12, %rd13; ld.global.u32 %r3, [%rd3]; - setp.eq.s32 %p4, %r28, 0; - @%p4 bra $L__BB7_9; - - and.b32 %r4, %r28, 1; - shl.b32 %r5, %r28, 1; - add.s32 %r6, %r3, %r27; - shl.b32 %r7, %r27, 1; - sub.s32 %r8, %r4, %r28; - div.u32 %r9, %r3, %r27; - mul.lo.s32 %r38, %r9, %r27; - sub.s32 %r10, %r3, %r38; - mov.u32 %r37, 0; - setp.eq.s32 %p5, %r28, 1; - setp.eq.s32 %p7, %r4, 0; - mov.u32 %r91, %r37; + setp.eq.s32 %p4, %r27, 0; + @%p4 bra $L__BB7_11; + + add.s32 %r4, %r27, -1; + add.s32 %r5, %r3, 3; + and.b32 %r6, %r27, 3; + sub.s32 %r7, %r6, %r27; + div.u32 %r8, %r3, %r26; + mul.lo.s32 %r37, %r8, %r26; + sub.s32 %r9, %r3, %r37; + mov.u32 %r36, 0; + setp.lt.u32 %p5, %r4, 3; + setp.eq.s32 %p7, %r6, 0; + setp.eq.s32 %p8, %r6, 1; + setp.eq.s32 %p9, %r6, 2; + mov.u32 %r162, %r36; $L__BB7_3: - mov.u32 %r97, %r37; + mul.lo.s32 %r11, %r162, %r27; + add.s32 %r39, %r162, %r8; + mad.lo.s32 %r12, %r39, %r26, %r9; + mov.u32 %r166, %r36; @%p5 bra $L__BB7_6; - ld.global.u16 %rs16, [%rd3+4]; - add.s32 %r95, %r28, %r91; - add.s32 %r93, %r6, %r91; - add.s32 %r92, %r3, %r91; - mov.u32 %r94, %r91; - mov.u32 %r97, %r37; + ld.global.u16 %rs32, [%rd3+4]; + mul.lo.s32 %r41, %r26, %r162; + add.s32 %r164, %r5, %r41; + add.s32 %r163, %r11, 3; + add.s32 %r15, %r3, %r41; + add.s32 %r16, %r15, 1; + add.s32 %r17, %r11, 1; + mov.u32 %r166, %r36; $L__BB7_5: - mul.wide.u32 %rd13, %r94, 4; - add.s64 %rd14, %rd2, %rd13; - mul.wide.u32 %rd15, %r92, 3; - add.s64 %rd16, %rd1, %rd15; - ld.global.u8 %r41, [%rd16]; - ld.global.u8 %r42, [%rd14]; - sub.s32 %r43, %r41, %r42; - abs.s32 %r44, %r43; - ld.global.u8 %r45, [%rd16+1]; - ld.global.u8 %r46, [%rd14+1]; - sub.s32 %r47, %r45, %r46; - abs.s32 %r48, %r47; - add.s32 %r49, %r48, %r44; - ld.global.u8 %r50, [%rd16+2]; - ld.global.u8 %r51, [%rd14+2]; - sub.s32 %r52, %r50, %r51; - abs.s32 %r53, %r52; - add.s32 %r54, %r49, %r53; - cvt.u16.u32 %rs4, %r54; - cvt.rn.f32.u16 %f1, %rs4; + add.s32 %r42, %r11, %r166; + mul.wide.u32 %rd14, %r42, 4; + add.s64 %rd15, %rd2, %rd14; + add.s32 %r43, %r15, %r166; + mul.wide.u32 %rd16, %r43, 3; + add.s64 %rd17, %rd1, %rd16; + ld.global.u8 %r44, [%rd17]; + ld.global.u8 %r45, [%rd15]; + sub.s32 %r46, %r44, %r45; + abs.s32 %r47, %r46; + ld.global.u8 %r48, [%rd17+1]; + ld.global.u8 %r49, [%rd15+1]; + sub.s32 %r50, %r48, %r49; + abs.s32 %r51, %r50; + add.s32 %r52, %r51, %r47; + ld.global.u8 %r53, [%rd17+2]; + ld.global.u8 %r54, [%rd15+2]; + sub.s32 %r55, %r53, %r54; + abs.s32 %r56, %r55; + add.s32 %r57, %r52, %r56; + cvt.u16.u32 %rs6, %r57; + cvt.rn.f32.u16 %f1, %rs6; cvt.f64.f32 %fd1, %f1; - ld.global.u8 %rs5, [%rd14+3]; - cvt.rn.f32.u16 %f2, %rs5; + ld.global.u8 %rs7, [%rd15+3]; + cvt.rn.f32.u16 %f2, %rs7; cvt.f64.f32 %fd2, %f2; div.rn.f64 %fd3, %fd2, 0d406FE00000000000; mul.f64 %fd4, %fd3, %fd1; - cvt.rzi.u32.f64 %r55, %fd4; - cvt.u16.u32 %rs6, %r55; - add.s16 %rs7, %rs16, %rs6; - st.global.u16 [%rd3+4], %rs7; - mul.wide.u32 %rd17, %r95, 4; - add.s64 %rd18, %rd2, %rd17; - mul.wide.u32 %rd19, %r93, 3; - add.s64 %rd20, %rd1, %rd19; - ld.global.u8 %r56, [%rd20]; - ld.global.u8 %r57, [%rd18]; - sub.s32 %r58, %r56, %r57; - abs.s32 %r59, %r58; - ld.global.u8 %r60, [%rd20+1]; - ld.global.u8 %r61, [%rd18+1]; - sub.s32 %r62, %r60, %r61; - abs.s32 %r63, %r62; - add.s32 %r64, %r63, %r59; - ld.global.u8 %r65, [%rd20+2]; - ld.global.u8 %r66, [%rd18+2]; + cvt.rzi.u32.f64 %r58, %fd4; + cvt.u16.u32 %rs8, %r58; + add.s16 %rs9, %rs32, %rs8; + st.global.u16 [%rd3+4], %rs9; + add.s32 %r59, %r17, %r166; + mul.wide.u32 %rd18, %r59, 4; + add.s64 %rd19, %rd2, %rd18; + add.s32 %r60, %r16, %r166; + mul.wide.u32 %rd20, %r60, 3; + add.s64 %rd21, %rd1, %rd20; + ld.global.u8 %r61, [%rd21]; + ld.global.u8 %r62, [%rd19]; + sub.s32 %r63, %r61, %r62; + abs.s32 %r64, %r63; + ld.global.u8 %r65, [%rd21+1]; + ld.global.u8 %r66, [%rd19+1]; sub.s32 %r67, %r65, %r66; abs.s32 %r68, %r67; - add.s32 %r69, %r64, %r68; - cvt.u16.u32 %rs8, %r69; - cvt.rn.f32.u16 %f3, %rs8; + add.s32 %r69, %r68, %r64; + ld.global.u8 %r70, [%rd21+2]; + ld.global.u8 %r71, [%rd19+2]; + sub.s32 %r72, %r70, %r71; + abs.s32 %r73, %r72; + add.s32 %r74, %r69, %r73; + cvt.u16.u32 %rs10, %r74; + cvt.rn.f32.u16 %f3, %rs10; cvt.f64.f32 %fd5, %f3; - ld.global.u8 %rs9, [%rd18+3]; - cvt.rn.f32.u16 %f4, %rs9; + ld.global.u8 %rs11, [%rd19+3]; + cvt.rn.f32.u16 %f4, %rs11; cvt.f64.f32 %fd6, %f4; div.rn.f64 %fd7, %fd6, 0d406FE00000000000; mul.f64 %fd8, %fd7, %fd5; - cvt.rzi.u32.f64 %r70, %fd8; - cvt.u16.u32 %rs10, %r70; - add.s16 %rs16, %rs7, %rs10; - st.global.u16 [%rd3+4], %rs16; - add.s32 %r95, %r95, %r5; - add.s32 %r94, %r94, %r5; - add.s32 %r93, %r93, %r7; - add.s32 %r92, %r92, %r7; - add.s32 %r97, %r97, 2; - add.s32 %r71, %r8, %r97; - setp.ne.s32 %p6, %r71, 0; - @%p6 bra $L__BB7_5; - -$L__BB7_6: - @%p7 bra $L__BB7_8; - - mad.lo.s32 %r72, %r97, %r28, %r91; - mul.wide.u32 %rd21, %r72, 4; - add.s64 %rd22, %rd2, %rd21; - add.s32 %r73, %r97, %r9; - add.s32 %r74, %r91, %r10; - mad.lo.s32 %r75, %r73, %r27, %r74; - mul.wide.u32 %rd23, %r75, 3; - add.s64 %rd24, %rd1, %rd23; - ld.global.u8 %r76, [%rd24]; - ld.global.u8 %r77, [%rd22]; - sub.s32 %r78, %r76, %r77; - abs.s32 %r79, %r78; - ld.global.u8 %r80, [%rd24+1]; - ld.global.u8 %r81, [%rd22+1]; - sub.s32 %r82, %r80, %r81; - abs.s32 %r83, %r82; - add.s32 %r84, %r83, %r79; - ld.global.u8 %r85, [%rd24+2]; - ld.global.u8 %r86, [%rd22+2]; - sub.s32 %r87, %r85, %r86; - abs.s32 %r88, %r87; - add.s32 %r89, %r84, %r88; - cvt.u16.u32 %rs11, %r89; - cvt.rn.f32.u16 %f5, %rs11; + cvt.rzi.u32.f64 %r75, %fd8; + cvt.u16.u32 %rs12, %r75; + add.s16 %rs13, %rs9, %rs12; + st.global.u16 [%rd3+4], %rs13; + add.s32 %r76, %r163, -1; + mul.wide.u32 %rd22, %r76, 4; + add.s64 %rd23, %rd2, %rd22; + add.s32 %r77, %r164, -1; + mul.wide.u32 %rd24, %r77, 3; + add.s64 %rd25, %rd1, %rd24; + ld.global.u8 %r78, [%rd25]; + ld.global.u8 %r79, [%rd23]; + sub.s32 %r80, %r78, %r79; + abs.s32 %r81, %r80; + ld.global.u8 %r82, [%rd25+1]; + ld.global.u8 %r83, [%rd23+1]; + sub.s32 %r84, %r82, %r83; + abs.s32 %r85, %r84; + add.s32 %r86, %r85, %r81; + ld.global.u8 %r87, [%rd25+2]; + ld.global.u8 %r88, [%rd23+2]; + sub.s32 %r89, %r87, %r88; + abs.s32 %r90, %r89; + add.s32 %r91, %r86, %r90; + cvt.u16.u32 %rs14, %r91; + cvt.rn.f32.u16 %f5, %rs14; cvt.f64.f32 %fd9, %f5; - ld.global.u8 %rs12, [%rd22+3]; - cvt.rn.f32.u16 %f6, %rs12; + ld.global.u8 %rs15, [%rd23+3]; + cvt.rn.f32.u16 %f6, %rs15; cvt.f64.f32 %fd10, %f6; div.rn.f64 %fd11, %fd10, 0d406FE00000000000; mul.f64 %fd12, %fd11, %fd9; - cvt.rzi.u32.f64 %r90, %fd12; - cvt.u16.u32 %rs13, %r90; - ld.global.u16 %rs14, [%rd3+4]; - add.s16 %rs15, %rs14, %rs13; - st.global.u16 [%rd3+4], %rs15; - -$L__BB7_8: - add.s32 %r91, %r91, 1; - setp.lt.u32 %p8, %r91, %r28; - @%p8 bra $L__BB7_3; - -$L__BB7_9: + cvt.rzi.u32.f64 %r92, %fd12; + cvt.u16.u32 %rs16, %r92; + add.s16 %rs17, %rs13, %rs16; + st.global.u16 [%rd3+4], %rs17; + mul.wide.u32 %rd26, %r163, 4; + add.s64 %rd27, %rd2, %rd26; + mul.wide.u32 %rd28, %r164, 3; + add.s64 %rd29, %rd1, %rd28; + ld.global.u8 %r93, [%rd29]; + ld.global.u8 %r94, [%rd27]; + sub.s32 %r95, %r93, %r94; + abs.s32 %r96, %r95; + ld.global.u8 %r97, [%rd29+1]; + ld.global.u8 %r98, [%rd27+1]; + sub.s32 %r99, %r97, %r98; + abs.s32 %r100, %r99; + add.s32 %r101, %r100, %r96; + ld.global.u8 %r102, [%rd29+2]; + ld.global.u8 %r103, [%rd27+2]; + sub.s32 %r104, %r102, %r103; + abs.s32 %r105, %r104; + add.s32 %r106, %r101, %r105; + cvt.u16.u32 %rs18, %r106; + cvt.rn.f32.u16 %f7, %rs18; + cvt.f64.f32 %fd13, %f7; + ld.global.u8 %rs19, [%rd27+3]; + cvt.rn.f32.u16 %f8, %rs19; + cvt.f64.f32 %fd14, %f8; + div.rn.f64 %fd15, %fd14, 0d406FE00000000000; + mul.f64 %fd16, %fd15, %fd13; + cvt.rzi.u32.f64 %r107, %fd16; + cvt.u16.u32 %rs20, %r107; + add.s16 %rs32, %rs17, %rs20; + st.global.u16 [%rd3+4], %rs32; + add.s32 %r164, %r164, 4; + add.s32 %r163, %r163, 4; + add.s32 %r166, %r166, 4; + add.s32 %r108, %r7, %r166; + setp.ne.s32 %p6, %r108, 0; + @%p6 bra $L__BB7_5; + +$L__BB7_6: + @%p7 bra $L__BB7_10; + + add.s32 %r109, %r166, %r11; + mul.wide.u32 %rd30, %r109, 4; + add.s64 %rd31, %rd2, %rd30; + add.s32 %r110, %r12, %r166; + mul.wide.u32 %rd32, %r110, 3; + add.s64 %rd33, %rd1, %rd32; + ld.global.u8 %r111, [%rd33]; + ld.global.u8 %r112, [%rd31]; + sub.s32 %r113, %r111, %r112; + abs.s32 %r114, %r113; + ld.global.u8 %r115, [%rd33+1]; + ld.global.u8 %r116, [%rd31+1]; + sub.s32 %r117, %r115, %r116; + abs.s32 %r118, %r117; + add.s32 %r119, %r118, %r114; + ld.global.u8 %r120, [%rd33+2]; + ld.global.u8 %r121, [%rd31+2]; + sub.s32 %r122, %r120, %r121; + abs.s32 %r123, %r122; + add.s32 %r124, %r119, %r123; + cvt.u16.u32 %rs21, %r124; + cvt.rn.f32.u16 %f9, %rs21; + cvt.f64.f32 %fd17, %f9; + ld.global.u8 %rs22, [%rd31+3]; + cvt.rn.f32.u16 %f10, %rs22; + cvt.f64.f32 %fd18, %f10; + div.rn.f64 %fd19, %fd18, 0d406FE00000000000; + mul.f64 %fd20, %fd19, %fd17; + cvt.rzi.u32.f64 %r125, %fd20; + cvt.u16.u32 %rs23, %r125; + ld.global.u16 %rs24, [%rd3+4]; + add.s16 %rs4, %rs24, %rs23; + st.global.u16 [%rd3+4], %rs4; + @%p8 bra $L__BB7_10; + + add.s32 %r126, %r166, 1; + add.s32 %r127, %r126, %r11; + mul.wide.u32 %rd34, %r127, 4; + add.s64 %rd35, %rd2, %rd34; + add.s32 %r128, %r12, %r126; + mul.wide.u32 %rd36, %r128, 3; + add.s64 %rd37, %rd1, %rd36; + ld.global.u8 %r129, [%rd37]; + ld.global.u8 %r130, [%rd35]; + sub.s32 %r131, %r129, %r130; + abs.s32 %r132, %r131; + ld.global.u8 %r133, [%rd37+1]; + ld.global.u8 %r134, [%rd35+1]; + sub.s32 %r135, %r133, %r134; + abs.s32 %r136, %r135; + add.s32 %r137, %r136, %r132; + ld.global.u8 %r138, [%rd37+2]; + ld.global.u8 %r139, [%rd35+2]; + sub.s32 %r140, %r138, %r139; + abs.s32 %r141, %r140; + add.s32 %r142, %r137, %r141; + cvt.u16.u32 %rs25, %r142; + cvt.rn.f32.u16 %f11, %rs25; + cvt.f64.f32 %fd21, %f11; + ld.global.u8 %rs26, [%rd35+3]; + cvt.rn.f32.u16 %f12, %rs26; + cvt.f64.f32 %fd22, %f12; + div.rn.f64 %fd23, %fd22, 0d406FE00000000000; + mul.f64 %fd24, %fd23, %fd21; + cvt.rzi.u32.f64 %r143, %fd24; + cvt.u16.u32 %rs27, %r143; + add.s16 %rs5, %rs4, %rs27; + st.global.u16 [%rd3+4], %rs5; + @%p9 bra $L__BB7_10; + + add.s32 %r144, %r166, 2; + add.s32 %r145, %r144, %r11; + mul.wide.u32 %rd38, %r145, 4; + add.s64 %rd39, %rd2, %rd38; + add.s32 %r146, %r12, %r144; + mul.wide.u32 %rd40, %r146, 3; + add.s64 %rd41, %rd1, %rd40; + ld.global.u8 %r147, [%rd41]; + ld.global.u8 %r148, [%rd39]; + sub.s32 %r149, %r147, %r148; + abs.s32 %r150, %r149; + ld.global.u8 %r151, [%rd41+1]; + ld.global.u8 %r152, [%rd39+1]; + sub.s32 %r153, %r151, %r152; + abs.s32 %r154, %r153; + add.s32 %r155, %r154, %r150; + ld.global.u8 %r156, [%rd41+2]; + ld.global.u8 %r157, [%rd39+2]; + sub.s32 %r158, %r156, %r157; + abs.s32 %r159, %r158; + add.s32 %r160, %r155, %r159; + cvt.u16.u32 %rs28, %r160; + cvt.rn.f32.u16 %f13, %rs28; + cvt.f64.f32 %fd25, %f13; + ld.global.u8 %rs29, [%rd39+3]; + cvt.rn.f32.u16 %f14, %rs29; + cvt.f64.f32 %fd26, %f14; + div.rn.f64 %fd27, %fd26, 0d406FE00000000000; + mul.f64 %fd28, %fd27, %fd25; + cvt.rzi.u32.f64 %r161, %fd28; + cvt.u16.u32 %rs30, %r161; + add.s16 %rs31, %rs5, %rs30; + st.global.u16 [%rd3+4], %rs31; + +$L__BB7_10: + add.s32 %r162, %r162, 1; + setp.lt.u32 %p10, %r162, %r27; + @%p10 bra $L__BB7_3; + +$L__BB7_11: ret; } diff --git a/vision-gpu/cuda/cuda_release_52.ptx b/vision-gpu/cuda/cuda_release_52.ptx index 8c365db..7dcdaf7 100644 --- a/vision-gpu/cuda/cuda_release_52.ptx +++ b/vision-gpu/cuda/cuda_release_52.ptx @@ -442,27 +442,27 @@ $L__BB4_3: or.pred %p15, %p13, %p14; @%p15 bra $L__BB4_15; - add.s32 %r105, %r1, -3; - add.s32 %r42, %r1, 3; + add.s32 %r105, %r2, -3; + add.s32 %r42, %r2, 3; setp.gt.u32 %p16, %r105, %r42; @%p16 bra $L__BB4_15; $L__BB4_7: - add.s32 %r106, %r2, -3; - add.s32 %r48, %r2, 3; + add.s32 %r106, %r1, -3; + add.s32 %r48, %r1, 3; setp.gt.u32 %p17, %r106, %r48; @%p17 bra $L__BB4_14; $L__BB4_9: or.b32 %r53, %r106, %r105; setp.lt.s32 %p18, %r53, 0; - setp.ge.u32 %p19, %r105, %r9; + setp.ge.u32 %p19, %r106, %r9; or.pred %p20, %p19, %p18; - setp.ge.u32 %p21, %r106, %r10; + setp.ge.u32 %p21, %r105, %r10; or.pred %p22, %p21, %p20; @%p22 bra $L__BB4_13; - mad.lo.s32 %r54, %r106, %r9, %r105; + mad.lo.s32 %r54, %r105, %r9, %r106; mul.wide.u32 %rd7, %r54, 3; add.s64 %rd8, %rd3, %rd7; ld.global.u8 %rs5, [%rd8]; @@ -836,186 +836,324 @@ $L__BB6_13: .param .u32 filter_map_marker_icons_param_6 ) { - .reg .pred %p<9>; - .reg .b16 %rs<17>; - .reg .f32 %f<7>; - .reg .b32 %r<98>; - .reg .f64 %fd<13>; - .reg .b64 %rd<25>; - - - ld.param.u64 %rd6, [filter_map_marker_icons_param_0]; - ld.param.u32 %r27, [filter_map_marker_icons_param_1]; - ld.param.u64 %rd4, [filter_map_marker_icons_param_2]; - ld.param.u64 %rd5, [filter_map_marker_icons_param_3]; - ld.param.u32 %r28, [filter_map_marker_icons_param_4]; - ld.param.u32 %r29, [filter_map_marker_icons_param_5]; - ld.param.u32 %r30, [filter_map_marker_icons_param_6]; - cvta.to.global.u64 %rd1, %rd6; - mov.u32 %r31, %ntid.x; - mov.u32 %r32, %ctaid.x; - mov.u32 %r33, %tid.x; - mad.lo.s32 %r1, %r32, %r31, %r33; - mov.u32 %r34, %ntid.y; - mov.u32 %r35, %ctaid.y; - mov.u32 %r36, %tid.y; - mad.lo.s32 %r2, %r35, %r34, %r36; - setp.ge.u32 %p1, %r1, %r29; - setp.ge.u32 %p2, %r2, %r30; + .reg .pred %p<11>; + .reg .b16 %rs<33>; + .reg .f32 %f<15>; + .reg .b32 %r<167>; + .reg .f64 %fd<29>; + .reg .b64 %rd<42>; + + + ld.param.u64 %rd7, [filter_map_marker_icons_param_0]; + ld.param.u32 %r26, [filter_map_marker_icons_param_1]; + ld.param.u64 %rd5, [filter_map_marker_icons_param_2]; + ld.param.u64 %rd6, [filter_map_marker_icons_param_3]; + ld.param.u32 %r27, [filter_map_marker_icons_param_4]; + ld.param.u32 %r28, [filter_map_marker_icons_param_5]; + ld.param.u32 %r29, [filter_map_marker_icons_param_6]; + cvta.to.global.u64 %rd1, %rd7; + mov.u32 %r30, %ntid.x; + mov.u32 %r31, %ctaid.x; + mov.u32 %r32, %tid.x; + mad.lo.s32 %r1, %r31, %r30, %r32; + mov.u32 %r33, %ntid.y; + mov.u32 %r34, %ctaid.y; + mov.u32 %r35, %tid.y; + mad.lo.s32 %r2, %r34, %r33, %r35; + setp.ge.u32 %p1, %r1, %r28; + setp.ge.u32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; - @%p3 bra $L__BB7_9; - - cvta.to.global.u64 %rd7, %rd5; - mul.wide.u32 %rd8, %r1, 8; - add.s64 %rd9, %rd7, %rd8; - ld.global.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd2, %rd10; - cvta.to.global.u64 %rd11, %rd4; - mul.wide.u32 %rd12, %r2, 8; - add.s64 %rd3, %rd11, %rd12; + @%p3 bra $L__BB7_11; + + cvta.to.global.u64 %rd8, %rd6; + mul.wide.u32 %rd9, %r1, 8; + add.s64 %rd10, %rd8, %rd9; + ld.global.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd2, %rd11; + cvta.to.global.u64 %rd12, %rd5; + mul.wide.u32 %rd13, %r2, 8; + add.s64 %rd3, %rd12, %rd13; ld.global.u32 %r3, [%rd3]; - setp.eq.s32 %p4, %r28, 0; - @%p4 bra $L__BB7_9; - - and.b32 %r4, %r28, 1; - shl.b32 %r5, %r28, 1; - add.s32 %r6, %r3, %r27; - shl.b32 %r7, %r27, 1; - sub.s32 %r8, %r4, %r28; - div.u32 %r9, %r3, %r27; - mul.lo.s32 %r38, %r9, %r27; - sub.s32 %r10, %r3, %r38; - mov.u32 %r37, 0; - setp.eq.s32 %p5, %r28, 1; - setp.eq.s32 %p7, %r4, 0; - mov.u32 %r91, %r37; + setp.eq.s32 %p4, %r27, 0; + @%p4 bra $L__BB7_11; + + add.s32 %r4, %r27, -1; + add.s32 %r5, %r3, 3; + and.b32 %r6, %r27, 3; + sub.s32 %r7, %r6, %r27; + div.u32 %r8, %r3, %r26; + mul.lo.s32 %r37, %r8, %r26; + sub.s32 %r9, %r3, %r37; + mov.u32 %r36, 0; + setp.lt.u32 %p5, %r4, 3; + setp.eq.s32 %p7, %r6, 0; + setp.eq.s32 %p8, %r6, 1; + setp.eq.s32 %p9, %r6, 2; + mov.u32 %r162, %r36; $L__BB7_3: - mov.u32 %r97, %r37; + mul.lo.s32 %r11, %r162, %r27; + add.s32 %r39, %r162, %r8; + mad.lo.s32 %r12, %r39, %r26, %r9; + mov.u32 %r166, %r36; @%p5 bra $L__BB7_6; - ld.global.u16 %rs16, [%rd3+4]; - add.s32 %r95, %r28, %r91; - add.s32 %r93, %r6, %r91; - add.s32 %r92, %r3, %r91; - mov.u32 %r94, %r91; - mov.u32 %r97, %r37; + ld.global.u16 %rs32, [%rd3+4]; + mul.lo.s32 %r41, %r26, %r162; + add.s32 %r164, %r5, %r41; + add.s32 %r163, %r11, 3; + add.s32 %r15, %r3, %r41; + add.s32 %r16, %r15, 1; + add.s32 %r17, %r11, 1; + mov.u32 %r166, %r36; $L__BB7_5: - mul.wide.u32 %rd13, %r94, 4; - add.s64 %rd14, %rd2, %rd13; - mul.wide.u32 %rd15, %r92, 3; - add.s64 %rd16, %rd1, %rd15; - ld.global.u8 %r41, [%rd16]; - ld.global.u8 %r42, [%rd14]; - sub.s32 %r43, %r41, %r42; - abs.s32 %r44, %r43; - ld.global.u8 %r45, [%rd16+1]; - ld.global.u8 %r46, [%rd14+1]; - sub.s32 %r47, %r45, %r46; - abs.s32 %r48, %r47; - add.s32 %r49, %r48, %r44; - ld.global.u8 %r50, [%rd16+2]; - ld.global.u8 %r51, [%rd14+2]; - sub.s32 %r52, %r50, %r51; - abs.s32 %r53, %r52; - add.s32 %r54, %r49, %r53; - cvt.u16.u32 %rs4, %r54; - cvt.rn.f32.u16 %f1, %rs4; + add.s32 %r42, %r11, %r166; + mul.wide.u32 %rd14, %r42, 4; + add.s64 %rd15, %rd2, %rd14; + add.s32 %r43, %r15, %r166; + mul.wide.u32 %rd16, %r43, 3; + add.s64 %rd17, %rd1, %rd16; + ld.global.u8 %r44, [%rd17]; + ld.global.u8 %r45, [%rd15]; + sub.s32 %r46, %r44, %r45; + abs.s32 %r47, %r46; + ld.global.u8 %r48, [%rd17+1]; + ld.global.u8 %r49, [%rd15+1]; + sub.s32 %r50, %r48, %r49; + abs.s32 %r51, %r50; + add.s32 %r52, %r51, %r47; + ld.global.u8 %r53, [%rd17+2]; + ld.global.u8 %r54, [%rd15+2]; + sub.s32 %r55, %r53, %r54; + abs.s32 %r56, %r55; + add.s32 %r57, %r52, %r56; + cvt.u16.u32 %rs6, %r57; + cvt.rn.f32.u16 %f1, %rs6; cvt.f64.f32 %fd1, %f1; - ld.global.u8 %rs5, [%rd14+3]; - cvt.rn.f32.u16 %f2, %rs5; + ld.global.u8 %rs7, [%rd15+3]; + cvt.rn.f32.u16 %f2, %rs7; cvt.f64.f32 %fd2, %f2; div.rn.f64 %fd3, %fd2, 0d406FE00000000000; mul.f64 %fd4, %fd3, %fd1; - cvt.rzi.u32.f64 %r55, %fd4; - cvt.u16.u32 %rs6, %r55; - add.s16 %rs7, %rs16, %rs6; - st.global.u16 [%rd3+4], %rs7; - mul.wide.u32 %rd17, %r95, 4; - add.s64 %rd18, %rd2, %rd17; - mul.wide.u32 %rd19, %r93, 3; - add.s64 %rd20, %rd1, %rd19; - ld.global.u8 %r56, [%rd20]; - ld.global.u8 %r57, [%rd18]; - sub.s32 %r58, %r56, %r57; - abs.s32 %r59, %r58; - ld.global.u8 %r60, [%rd20+1]; - ld.global.u8 %r61, [%rd18+1]; - sub.s32 %r62, %r60, %r61; - abs.s32 %r63, %r62; - add.s32 %r64, %r63, %r59; - ld.global.u8 %r65, [%rd20+2]; - ld.global.u8 %r66, [%rd18+2]; + cvt.rzi.u32.f64 %r58, %fd4; + cvt.u16.u32 %rs8, %r58; + add.s16 %rs9, %rs32, %rs8; + st.global.u16 [%rd3+4], %rs9; + add.s32 %r59, %r17, %r166; + mul.wide.u32 %rd18, %r59, 4; + add.s64 %rd19, %rd2, %rd18; + add.s32 %r60, %r16, %r166; + mul.wide.u32 %rd20, %r60, 3; + add.s64 %rd21, %rd1, %rd20; + ld.global.u8 %r61, [%rd21]; + ld.global.u8 %r62, [%rd19]; + sub.s32 %r63, %r61, %r62; + abs.s32 %r64, %r63; + ld.global.u8 %r65, [%rd21+1]; + ld.global.u8 %r66, [%rd19+1]; sub.s32 %r67, %r65, %r66; abs.s32 %r68, %r67; - add.s32 %r69, %r64, %r68; - cvt.u16.u32 %rs8, %r69; - cvt.rn.f32.u16 %f3, %rs8; + add.s32 %r69, %r68, %r64; + ld.global.u8 %r70, [%rd21+2]; + ld.global.u8 %r71, [%rd19+2]; + sub.s32 %r72, %r70, %r71; + abs.s32 %r73, %r72; + add.s32 %r74, %r69, %r73; + cvt.u16.u32 %rs10, %r74; + cvt.rn.f32.u16 %f3, %rs10; cvt.f64.f32 %fd5, %f3; - ld.global.u8 %rs9, [%rd18+3]; - cvt.rn.f32.u16 %f4, %rs9; + ld.global.u8 %rs11, [%rd19+3]; + cvt.rn.f32.u16 %f4, %rs11; cvt.f64.f32 %fd6, %f4; div.rn.f64 %fd7, %fd6, 0d406FE00000000000; mul.f64 %fd8, %fd7, %fd5; - cvt.rzi.u32.f64 %r70, %fd8; - cvt.u16.u32 %rs10, %r70; - add.s16 %rs16, %rs7, %rs10; - st.global.u16 [%rd3+4], %rs16; - add.s32 %r95, %r95, %r5; - add.s32 %r94, %r94, %r5; - add.s32 %r93, %r93, %r7; - add.s32 %r92, %r92, %r7; - add.s32 %r97, %r97, 2; - add.s32 %r71, %r8, %r97; - setp.ne.s32 %p6, %r71, 0; - @%p6 bra $L__BB7_5; - -$L__BB7_6: - @%p7 bra $L__BB7_8; - - mad.lo.s32 %r72, %r97, %r28, %r91; - mul.wide.u32 %rd21, %r72, 4; - add.s64 %rd22, %rd2, %rd21; - add.s32 %r73, %r97, %r9; - add.s32 %r74, %r91, %r10; - mad.lo.s32 %r75, %r73, %r27, %r74; - mul.wide.u32 %rd23, %r75, 3; - add.s64 %rd24, %rd1, %rd23; - ld.global.u8 %r76, [%rd24]; - ld.global.u8 %r77, [%rd22]; - sub.s32 %r78, %r76, %r77; - abs.s32 %r79, %r78; - ld.global.u8 %r80, [%rd24+1]; - ld.global.u8 %r81, [%rd22+1]; - sub.s32 %r82, %r80, %r81; - abs.s32 %r83, %r82; - add.s32 %r84, %r83, %r79; - ld.global.u8 %r85, [%rd24+2]; - ld.global.u8 %r86, [%rd22+2]; - sub.s32 %r87, %r85, %r86; - abs.s32 %r88, %r87; - add.s32 %r89, %r84, %r88; - cvt.u16.u32 %rs11, %r89; - cvt.rn.f32.u16 %f5, %rs11; + cvt.rzi.u32.f64 %r75, %fd8; + cvt.u16.u32 %rs12, %r75; + add.s16 %rs13, %rs9, %rs12; + st.global.u16 [%rd3+4], %rs13; + add.s32 %r76, %r163, -1; + mul.wide.u32 %rd22, %r76, 4; + add.s64 %rd23, %rd2, %rd22; + add.s32 %r77, %r164, -1; + mul.wide.u32 %rd24, %r77, 3; + add.s64 %rd25, %rd1, %rd24; + ld.global.u8 %r78, [%rd25]; + ld.global.u8 %r79, [%rd23]; + sub.s32 %r80, %r78, %r79; + abs.s32 %r81, %r80; + ld.global.u8 %r82, [%rd25+1]; + ld.global.u8 %r83, [%rd23+1]; + sub.s32 %r84, %r82, %r83; + abs.s32 %r85, %r84; + add.s32 %r86, %r85, %r81; + ld.global.u8 %r87, [%rd25+2]; + ld.global.u8 %r88, [%rd23+2]; + sub.s32 %r89, %r87, %r88; + abs.s32 %r90, %r89; + add.s32 %r91, %r86, %r90; + cvt.u16.u32 %rs14, %r91; + cvt.rn.f32.u16 %f5, %rs14; cvt.f64.f32 %fd9, %f5; - ld.global.u8 %rs12, [%rd22+3]; - cvt.rn.f32.u16 %f6, %rs12; + ld.global.u8 %rs15, [%rd23+3]; + cvt.rn.f32.u16 %f6, %rs15; cvt.f64.f32 %fd10, %f6; div.rn.f64 %fd11, %fd10, 0d406FE00000000000; mul.f64 %fd12, %fd11, %fd9; - cvt.rzi.u32.f64 %r90, %fd12; - cvt.u16.u32 %rs13, %r90; - ld.global.u16 %rs14, [%rd3+4]; - add.s16 %rs15, %rs14, %rs13; - st.global.u16 [%rd3+4], %rs15; - -$L__BB7_8: - add.s32 %r91, %r91, 1; - setp.lt.u32 %p8, %r91, %r28; - @%p8 bra $L__BB7_3; - -$L__BB7_9: + cvt.rzi.u32.f64 %r92, %fd12; + cvt.u16.u32 %rs16, %r92; + add.s16 %rs17, %rs13, %rs16; + st.global.u16 [%rd3+4], %rs17; + mul.wide.u32 %rd26, %r163, 4; + add.s64 %rd27, %rd2, %rd26; + mul.wide.u32 %rd28, %r164, 3; + add.s64 %rd29, %rd1, %rd28; + ld.global.u8 %r93, [%rd29]; + ld.global.u8 %r94, [%rd27]; + sub.s32 %r95, %r93, %r94; + abs.s32 %r96, %r95; + ld.global.u8 %r97, [%rd29+1]; + ld.global.u8 %r98, [%rd27+1]; + sub.s32 %r99, %r97, %r98; + abs.s32 %r100, %r99; + add.s32 %r101, %r100, %r96; + ld.global.u8 %r102, [%rd29+2]; + ld.global.u8 %r103, [%rd27+2]; + sub.s32 %r104, %r102, %r103; + abs.s32 %r105, %r104; + add.s32 %r106, %r101, %r105; + cvt.u16.u32 %rs18, %r106; + cvt.rn.f32.u16 %f7, %rs18; + cvt.f64.f32 %fd13, %f7; + ld.global.u8 %rs19, [%rd27+3]; + cvt.rn.f32.u16 %f8, %rs19; + cvt.f64.f32 %fd14, %f8; + div.rn.f64 %fd15, %fd14, 0d406FE00000000000; + mul.f64 %fd16, %fd15, %fd13; + cvt.rzi.u32.f64 %r107, %fd16; + cvt.u16.u32 %rs20, %r107; + add.s16 %rs32, %rs17, %rs20; + st.global.u16 [%rd3+4], %rs32; + add.s32 %r164, %r164, 4; + add.s32 %r163, %r163, 4; + add.s32 %r166, %r166, 4; + add.s32 %r108, %r7, %r166; + setp.ne.s32 %p6, %r108, 0; + @%p6 bra $L__BB7_5; + +$L__BB7_6: + @%p7 bra $L__BB7_10; + + add.s32 %r109, %r166, %r11; + mul.wide.u32 %rd30, %r109, 4; + add.s64 %rd31, %rd2, %rd30; + add.s32 %r110, %r12, %r166; + mul.wide.u32 %rd32, %r110, 3; + add.s64 %rd33, %rd1, %rd32; + ld.global.u8 %r111, [%rd33]; + ld.global.u8 %r112, [%rd31]; + sub.s32 %r113, %r111, %r112; + abs.s32 %r114, %r113; + ld.global.u8 %r115, [%rd33+1]; + ld.global.u8 %r116, [%rd31+1]; + sub.s32 %r117, %r115, %r116; + abs.s32 %r118, %r117; + add.s32 %r119, %r118, %r114; + ld.global.u8 %r120, [%rd33+2]; + ld.global.u8 %r121, [%rd31+2]; + sub.s32 %r122, %r120, %r121; + abs.s32 %r123, %r122; + add.s32 %r124, %r119, %r123; + cvt.u16.u32 %rs21, %r124; + cvt.rn.f32.u16 %f9, %rs21; + cvt.f64.f32 %fd17, %f9; + ld.global.u8 %rs22, [%rd31+3]; + cvt.rn.f32.u16 %f10, %rs22; + cvt.f64.f32 %fd18, %f10; + div.rn.f64 %fd19, %fd18, 0d406FE00000000000; + mul.f64 %fd20, %fd19, %fd17; + cvt.rzi.u32.f64 %r125, %fd20; + cvt.u16.u32 %rs23, %r125; + ld.global.u16 %rs24, [%rd3+4]; + add.s16 %rs4, %rs24, %rs23; + st.global.u16 [%rd3+4], %rs4; + @%p8 bra $L__BB7_10; + + add.s32 %r126, %r166, 1; + add.s32 %r127, %r126, %r11; + mul.wide.u32 %rd34, %r127, 4; + add.s64 %rd35, %rd2, %rd34; + add.s32 %r128, %r12, %r126; + mul.wide.u32 %rd36, %r128, 3; + add.s64 %rd37, %rd1, %rd36; + ld.global.u8 %r129, [%rd37]; + ld.global.u8 %r130, [%rd35]; + sub.s32 %r131, %r129, %r130; + abs.s32 %r132, %r131; + ld.global.u8 %r133, [%rd37+1]; + ld.global.u8 %r134, [%rd35+1]; + sub.s32 %r135, %r133, %r134; + abs.s32 %r136, %r135; + add.s32 %r137, %r136, %r132; + ld.global.u8 %r138, [%rd37+2]; + ld.global.u8 %r139, [%rd35+2]; + sub.s32 %r140, %r138, %r139; + abs.s32 %r141, %r140; + add.s32 %r142, %r137, %r141; + cvt.u16.u32 %rs25, %r142; + cvt.rn.f32.u16 %f11, %rs25; + cvt.f64.f32 %fd21, %f11; + ld.global.u8 %rs26, [%rd35+3]; + cvt.rn.f32.u16 %f12, %rs26; + cvt.f64.f32 %fd22, %f12; + div.rn.f64 %fd23, %fd22, 0d406FE00000000000; + mul.f64 %fd24, %fd23, %fd21; + cvt.rzi.u32.f64 %r143, %fd24; + cvt.u16.u32 %rs27, %r143; + add.s16 %rs5, %rs4, %rs27; + st.global.u16 [%rd3+4], %rs5; + @%p9 bra $L__BB7_10; + + add.s32 %r144, %r166, 2; + add.s32 %r145, %r144, %r11; + mul.wide.u32 %rd38, %r145, 4; + add.s64 %rd39, %rd2, %rd38; + add.s32 %r146, %r12, %r144; + mul.wide.u32 %rd40, %r146, 3; + add.s64 %rd41, %rd1, %rd40; + ld.global.u8 %r147, [%rd41]; + ld.global.u8 %r148, [%rd39]; + sub.s32 %r149, %r147, %r148; + abs.s32 %r150, %r149; + ld.global.u8 %r151, [%rd41+1]; + ld.global.u8 %r152, [%rd39+1]; + sub.s32 %r153, %r151, %r152; + abs.s32 %r154, %r153; + add.s32 %r155, %r154, %r150; + ld.global.u8 %r156, [%rd41+2]; + ld.global.u8 %r157, [%rd39+2]; + sub.s32 %r158, %r156, %r157; + abs.s32 %r159, %r158; + add.s32 %r160, %r155, %r159; + cvt.u16.u32 %rs28, %r160; + cvt.rn.f32.u16 %f13, %rs28; + cvt.f64.f32 %fd25, %f13; + ld.global.u8 %rs29, [%rd39+3]; + cvt.rn.f32.u16 %f14, %rs29; + cvt.f64.f32 %fd26, %f14; + div.rn.f64 %fd27, %fd26, 0d406FE00000000000; + mul.f64 %fd28, %fd27, %fd25; + cvt.rzi.u32.f64 %r161, %fd28; + cvt.u16.u32 %rs30, %r161; + add.s16 %rs31, %rs5, %rs30; + st.global.u16 [%rd3+4], %rs31; + +$L__BB7_10: + add.s32 %r162, %r162, 1; + setp.lt.u32 %p10, %r162, %r27; + @%p10 bra $L__BB7_3; + +$L__BB7_11: ret; } diff --git a/vision-gpu/cuda/cuda_release_61.ptx b/vision-gpu/cuda/cuda_release_61.ptx index 4698e01..af0a018 100644 --- a/vision-gpu/cuda/cuda_release_61.ptx +++ b/vision-gpu/cuda/cuda_release_61.ptx @@ -442,27 +442,27 @@ $L__BB4_3: or.pred %p15, %p13, %p14; @%p15 bra $L__BB4_15; - add.s32 %r105, %r1, -3; - add.s32 %r42, %r1, 3; + add.s32 %r105, %r2, -3; + add.s32 %r42, %r2, 3; setp.gt.u32 %p16, %r105, %r42; @%p16 bra $L__BB4_15; $L__BB4_7: - add.s32 %r106, %r2, -3; - add.s32 %r48, %r2, 3; + add.s32 %r106, %r1, -3; + add.s32 %r48, %r1, 3; setp.gt.u32 %p17, %r106, %r48; @%p17 bra $L__BB4_14; $L__BB4_9: or.b32 %r53, %r106, %r105; setp.lt.s32 %p18, %r53, 0; - setp.ge.u32 %p19, %r105, %r9; + setp.ge.u32 %p19, %r106, %r9; or.pred %p20, %p19, %p18; - setp.ge.u32 %p21, %r106, %r10; + setp.ge.u32 %p21, %r105, %r10; or.pred %p22, %p21, %p20; @%p22 bra $L__BB4_13; - mad.lo.s32 %r54, %r106, %r9, %r105; + mad.lo.s32 %r54, %r105, %r9, %r106; mul.wide.u32 %rd7, %r54, 3; add.s64 %rd8, %rd3, %rd7; ld.global.u8 %rs5, [%rd8]; @@ -836,186 +836,324 @@ $L__BB6_13: .param .u32 filter_map_marker_icons_param_6 ) { - .reg .pred %p<9>; - .reg .b16 %rs<17>; - .reg .f32 %f<7>; - .reg .b32 %r<98>; - .reg .f64 %fd<13>; - .reg .b64 %rd<25>; - - - ld.param.u64 %rd6, [filter_map_marker_icons_param_0]; - ld.param.u32 %r27, [filter_map_marker_icons_param_1]; - ld.param.u64 %rd4, [filter_map_marker_icons_param_2]; - ld.param.u64 %rd5, [filter_map_marker_icons_param_3]; - ld.param.u32 %r28, [filter_map_marker_icons_param_4]; - ld.param.u32 %r29, [filter_map_marker_icons_param_5]; - ld.param.u32 %r30, [filter_map_marker_icons_param_6]; - cvta.to.global.u64 %rd1, %rd6; - mov.u32 %r31, %ntid.x; - mov.u32 %r32, %ctaid.x; - mov.u32 %r33, %tid.x; - mad.lo.s32 %r1, %r32, %r31, %r33; - mov.u32 %r34, %ntid.y; - mov.u32 %r35, %ctaid.y; - mov.u32 %r36, %tid.y; - mad.lo.s32 %r2, %r35, %r34, %r36; - setp.ge.u32 %p1, %r1, %r29; - setp.ge.u32 %p2, %r2, %r30; + .reg .pred %p<11>; + .reg .b16 %rs<33>; + .reg .f32 %f<15>; + .reg .b32 %r<167>; + .reg .f64 %fd<29>; + .reg .b64 %rd<42>; + + + ld.param.u64 %rd7, [filter_map_marker_icons_param_0]; + ld.param.u32 %r26, [filter_map_marker_icons_param_1]; + ld.param.u64 %rd5, [filter_map_marker_icons_param_2]; + ld.param.u64 %rd6, [filter_map_marker_icons_param_3]; + ld.param.u32 %r27, [filter_map_marker_icons_param_4]; + ld.param.u32 %r28, [filter_map_marker_icons_param_5]; + ld.param.u32 %r29, [filter_map_marker_icons_param_6]; + cvta.to.global.u64 %rd1, %rd7; + mov.u32 %r30, %ntid.x; + mov.u32 %r31, %ctaid.x; + mov.u32 %r32, %tid.x; + mad.lo.s32 %r1, %r31, %r30, %r32; + mov.u32 %r33, %ntid.y; + mov.u32 %r34, %ctaid.y; + mov.u32 %r35, %tid.y; + mad.lo.s32 %r2, %r34, %r33, %r35; + setp.ge.u32 %p1, %r1, %r28; + setp.ge.u32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; - @%p3 bra $L__BB7_9; - - cvta.to.global.u64 %rd7, %rd5; - mul.wide.u32 %rd8, %r1, 8; - add.s64 %rd9, %rd7, %rd8; - ld.global.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd2, %rd10; - cvta.to.global.u64 %rd11, %rd4; - mul.wide.u32 %rd12, %r2, 8; - add.s64 %rd3, %rd11, %rd12; + @%p3 bra $L__BB7_11; + + cvta.to.global.u64 %rd8, %rd6; + mul.wide.u32 %rd9, %r1, 8; + add.s64 %rd10, %rd8, %rd9; + ld.global.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd2, %rd11; + cvta.to.global.u64 %rd12, %rd5; + mul.wide.u32 %rd13, %r2, 8; + add.s64 %rd3, %rd12, %rd13; ld.global.u32 %r3, [%rd3]; - setp.eq.s32 %p4, %r28, 0; - @%p4 bra $L__BB7_9; - - and.b32 %r4, %r28, 1; - shl.b32 %r5, %r28, 1; - add.s32 %r6, %r3, %r27; - shl.b32 %r7, %r27, 1; - sub.s32 %r8, %r4, %r28; - div.u32 %r9, %r3, %r27; - mul.lo.s32 %r38, %r9, %r27; - sub.s32 %r10, %r3, %r38; - mov.u32 %r37, 0; - setp.eq.s32 %p5, %r28, 1; - setp.eq.s32 %p7, %r4, 0; - mov.u32 %r91, %r37; + setp.eq.s32 %p4, %r27, 0; + @%p4 bra $L__BB7_11; + + add.s32 %r4, %r27, -1; + add.s32 %r5, %r3, 3; + and.b32 %r6, %r27, 3; + sub.s32 %r7, %r6, %r27; + div.u32 %r8, %r3, %r26; + mul.lo.s32 %r37, %r8, %r26; + sub.s32 %r9, %r3, %r37; + mov.u32 %r36, 0; + setp.lt.u32 %p5, %r4, 3; + setp.eq.s32 %p7, %r6, 0; + setp.eq.s32 %p8, %r6, 1; + setp.eq.s32 %p9, %r6, 2; + mov.u32 %r162, %r36; $L__BB7_3: - mov.u32 %r97, %r37; + mul.lo.s32 %r11, %r162, %r27; + add.s32 %r39, %r162, %r8; + mad.lo.s32 %r12, %r39, %r26, %r9; + mov.u32 %r166, %r36; @%p5 bra $L__BB7_6; - ld.global.u16 %rs16, [%rd3+4]; - add.s32 %r95, %r28, %r91; - add.s32 %r93, %r6, %r91; - add.s32 %r92, %r3, %r91; - mov.u32 %r94, %r91; - mov.u32 %r97, %r37; + ld.global.u16 %rs32, [%rd3+4]; + mul.lo.s32 %r41, %r26, %r162; + add.s32 %r164, %r5, %r41; + add.s32 %r163, %r11, 3; + add.s32 %r15, %r3, %r41; + add.s32 %r16, %r15, 1; + add.s32 %r17, %r11, 1; + mov.u32 %r166, %r36; $L__BB7_5: - mul.wide.u32 %rd13, %r94, 4; - add.s64 %rd14, %rd2, %rd13; - mul.wide.u32 %rd15, %r92, 3; - add.s64 %rd16, %rd1, %rd15; - ld.global.u8 %r41, [%rd16]; - ld.global.u8 %r42, [%rd14]; - sub.s32 %r43, %r41, %r42; - abs.s32 %r44, %r43; - ld.global.u8 %r45, [%rd16+1]; - ld.global.u8 %r46, [%rd14+1]; - sub.s32 %r47, %r45, %r46; - abs.s32 %r48, %r47; - add.s32 %r49, %r48, %r44; - ld.global.u8 %r50, [%rd16+2]; - ld.global.u8 %r51, [%rd14+2]; - sub.s32 %r52, %r50, %r51; - abs.s32 %r53, %r52; - add.s32 %r54, %r49, %r53; - cvt.u16.u32 %rs4, %r54; - cvt.rn.f32.u16 %f1, %rs4; + add.s32 %r42, %r11, %r166; + mul.wide.u32 %rd14, %r42, 4; + add.s64 %rd15, %rd2, %rd14; + add.s32 %r43, %r15, %r166; + mul.wide.u32 %rd16, %r43, 3; + add.s64 %rd17, %rd1, %rd16; + ld.global.u8 %r44, [%rd17]; + ld.global.u8 %r45, [%rd15]; + sub.s32 %r46, %r44, %r45; + abs.s32 %r47, %r46; + ld.global.u8 %r48, [%rd17+1]; + ld.global.u8 %r49, [%rd15+1]; + sub.s32 %r50, %r48, %r49; + abs.s32 %r51, %r50; + add.s32 %r52, %r51, %r47; + ld.global.u8 %r53, [%rd17+2]; + ld.global.u8 %r54, [%rd15+2]; + sub.s32 %r55, %r53, %r54; + abs.s32 %r56, %r55; + add.s32 %r57, %r52, %r56; + cvt.u16.u32 %rs6, %r57; + cvt.rn.f32.u16 %f1, %rs6; cvt.f64.f32 %fd1, %f1; - ld.global.u8 %rs5, [%rd14+3]; - cvt.rn.f32.u16 %f2, %rs5; + ld.global.u8 %rs7, [%rd15+3]; + cvt.rn.f32.u16 %f2, %rs7; cvt.f64.f32 %fd2, %f2; div.rn.f64 %fd3, %fd2, 0d406FE00000000000; mul.f64 %fd4, %fd3, %fd1; - cvt.rzi.u32.f64 %r55, %fd4; - cvt.u16.u32 %rs6, %r55; - add.s16 %rs7, %rs16, %rs6; - st.global.u16 [%rd3+4], %rs7; - mul.wide.u32 %rd17, %r95, 4; - add.s64 %rd18, %rd2, %rd17; - mul.wide.u32 %rd19, %r93, 3; - add.s64 %rd20, %rd1, %rd19; - ld.global.u8 %r56, [%rd20]; - ld.global.u8 %r57, [%rd18]; - sub.s32 %r58, %r56, %r57; - abs.s32 %r59, %r58; - ld.global.u8 %r60, [%rd20+1]; - ld.global.u8 %r61, [%rd18+1]; - sub.s32 %r62, %r60, %r61; - abs.s32 %r63, %r62; - add.s32 %r64, %r63, %r59; - ld.global.u8 %r65, [%rd20+2]; - ld.global.u8 %r66, [%rd18+2]; + cvt.rzi.u32.f64 %r58, %fd4; + cvt.u16.u32 %rs8, %r58; + add.s16 %rs9, %rs32, %rs8; + st.global.u16 [%rd3+4], %rs9; + add.s32 %r59, %r17, %r166; + mul.wide.u32 %rd18, %r59, 4; + add.s64 %rd19, %rd2, %rd18; + add.s32 %r60, %r16, %r166; + mul.wide.u32 %rd20, %r60, 3; + add.s64 %rd21, %rd1, %rd20; + ld.global.u8 %r61, [%rd21]; + ld.global.u8 %r62, [%rd19]; + sub.s32 %r63, %r61, %r62; + abs.s32 %r64, %r63; + ld.global.u8 %r65, [%rd21+1]; + ld.global.u8 %r66, [%rd19+1]; sub.s32 %r67, %r65, %r66; abs.s32 %r68, %r67; - add.s32 %r69, %r64, %r68; - cvt.u16.u32 %rs8, %r69; - cvt.rn.f32.u16 %f3, %rs8; + add.s32 %r69, %r68, %r64; + ld.global.u8 %r70, [%rd21+2]; + ld.global.u8 %r71, [%rd19+2]; + sub.s32 %r72, %r70, %r71; + abs.s32 %r73, %r72; + add.s32 %r74, %r69, %r73; + cvt.u16.u32 %rs10, %r74; + cvt.rn.f32.u16 %f3, %rs10; cvt.f64.f32 %fd5, %f3; - ld.global.u8 %rs9, [%rd18+3]; - cvt.rn.f32.u16 %f4, %rs9; + ld.global.u8 %rs11, [%rd19+3]; + cvt.rn.f32.u16 %f4, %rs11; cvt.f64.f32 %fd6, %f4; div.rn.f64 %fd7, %fd6, 0d406FE00000000000; mul.f64 %fd8, %fd7, %fd5; - cvt.rzi.u32.f64 %r70, %fd8; - cvt.u16.u32 %rs10, %r70; - add.s16 %rs16, %rs7, %rs10; - st.global.u16 [%rd3+4], %rs16; - add.s32 %r95, %r95, %r5; - add.s32 %r94, %r94, %r5; - add.s32 %r93, %r93, %r7; - add.s32 %r92, %r92, %r7; - add.s32 %r97, %r97, 2; - add.s32 %r71, %r8, %r97; - setp.ne.s32 %p6, %r71, 0; - @%p6 bra $L__BB7_5; - -$L__BB7_6: - @%p7 bra $L__BB7_8; - - mad.lo.s32 %r72, %r97, %r28, %r91; - mul.wide.u32 %rd21, %r72, 4; - add.s64 %rd22, %rd2, %rd21; - add.s32 %r73, %r97, %r9; - add.s32 %r74, %r91, %r10; - mad.lo.s32 %r75, %r73, %r27, %r74; - mul.wide.u32 %rd23, %r75, 3; - add.s64 %rd24, %rd1, %rd23; - ld.global.u8 %r76, [%rd24]; - ld.global.u8 %r77, [%rd22]; - sub.s32 %r78, %r76, %r77; - abs.s32 %r79, %r78; - ld.global.u8 %r80, [%rd24+1]; - ld.global.u8 %r81, [%rd22+1]; - sub.s32 %r82, %r80, %r81; - abs.s32 %r83, %r82; - add.s32 %r84, %r83, %r79; - ld.global.u8 %r85, [%rd24+2]; - ld.global.u8 %r86, [%rd22+2]; - sub.s32 %r87, %r85, %r86; - abs.s32 %r88, %r87; - add.s32 %r89, %r84, %r88; - cvt.u16.u32 %rs11, %r89; - cvt.rn.f32.u16 %f5, %rs11; + cvt.rzi.u32.f64 %r75, %fd8; + cvt.u16.u32 %rs12, %r75; + add.s16 %rs13, %rs9, %rs12; + st.global.u16 [%rd3+4], %rs13; + add.s32 %r76, %r163, -1; + mul.wide.u32 %rd22, %r76, 4; + add.s64 %rd23, %rd2, %rd22; + add.s32 %r77, %r164, -1; + mul.wide.u32 %rd24, %r77, 3; + add.s64 %rd25, %rd1, %rd24; + ld.global.u8 %r78, [%rd25]; + ld.global.u8 %r79, [%rd23]; + sub.s32 %r80, %r78, %r79; + abs.s32 %r81, %r80; + ld.global.u8 %r82, [%rd25+1]; + ld.global.u8 %r83, [%rd23+1]; + sub.s32 %r84, %r82, %r83; + abs.s32 %r85, %r84; + add.s32 %r86, %r85, %r81; + ld.global.u8 %r87, [%rd25+2]; + ld.global.u8 %r88, [%rd23+2]; + sub.s32 %r89, %r87, %r88; + abs.s32 %r90, %r89; + add.s32 %r91, %r86, %r90; + cvt.u16.u32 %rs14, %r91; + cvt.rn.f32.u16 %f5, %rs14; cvt.f64.f32 %fd9, %f5; - ld.global.u8 %rs12, [%rd22+3]; - cvt.rn.f32.u16 %f6, %rs12; + ld.global.u8 %rs15, [%rd23+3]; + cvt.rn.f32.u16 %f6, %rs15; cvt.f64.f32 %fd10, %f6; div.rn.f64 %fd11, %fd10, 0d406FE00000000000; mul.f64 %fd12, %fd11, %fd9; - cvt.rzi.u32.f64 %r90, %fd12; - cvt.u16.u32 %rs13, %r90; - ld.global.u16 %rs14, [%rd3+4]; - add.s16 %rs15, %rs14, %rs13; - st.global.u16 [%rd3+4], %rs15; - -$L__BB7_8: - add.s32 %r91, %r91, 1; - setp.lt.u32 %p8, %r91, %r28; - @%p8 bra $L__BB7_3; - -$L__BB7_9: + cvt.rzi.u32.f64 %r92, %fd12; + cvt.u16.u32 %rs16, %r92; + add.s16 %rs17, %rs13, %rs16; + st.global.u16 [%rd3+4], %rs17; + mul.wide.u32 %rd26, %r163, 4; + add.s64 %rd27, %rd2, %rd26; + mul.wide.u32 %rd28, %r164, 3; + add.s64 %rd29, %rd1, %rd28; + ld.global.u8 %r93, [%rd29]; + ld.global.u8 %r94, [%rd27]; + sub.s32 %r95, %r93, %r94; + abs.s32 %r96, %r95; + ld.global.u8 %r97, [%rd29+1]; + ld.global.u8 %r98, [%rd27+1]; + sub.s32 %r99, %r97, %r98; + abs.s32 %r100, %r99; + add.s32 %r101, %r100, %r96; + ld.global.u8 %r102, [%rd29+2]; + ld.global.u8 %r103, [%rd27+2]; + sub.s32 %r104, %r102, %r103; + abs.s32 %r105, %r104; + add.s32 %r106, %r101, %r105; + cvt.u16.u32 %rs18, %r106; + cvt.rn.f32.u16 %f7, %rs18; + cvt.f64.f32 %fd13, %f7; + ld.global.u8 %rs19, [%rd27+3]; + cvt.rn.f32.u16 %f8, %rs19; + cvt.f64.f32 %fd14, %f8; + div.rn.f64 %fd15, %fd14, 0d406FE00000000000; + mul.f64 %fd16, %fd15, %fd13; + cvt.rzi.u32.f64 %r107, %fd16; + cvt.u16.u32 %rs20, %r107; + add.s16 %rs32, %rs17, %rs20; + st.global.u16 [%rd3+4], %rs32; + add.s32 %r164, %r164, 4; + add.s32 %r163, %r163, 4; + add.s32 %r166, %r166, 4; + add.s32 %r108, %r7, %r166; + setp.ne.s32 %p6, %r108, 0; + @%p6 bra $L__BB7_5; + +$L__BB7_6: + @%p7 bra $L__BB7_10; + + add.s32 %r109, %r166, %r11; + mul.wide.u32 %rd30, %r109, 4; + add.s64 %rd31, %rd2, %rd30; + add.s32 %r110, %r12, %r166; + mul.wide.u32 %rd32, %r110, 3; + add.s64 %rd33, %rd1, %rd32; + ld.global.u8 %r111, [%rd33]; + ld.global.u8 %r112, [%rd31]; + sub.s32 %r113, %r111, %r112; + abs.s32 %r114, %r113; + ld.global.u8 %r115, [%rd33+1]; + ld.global.u8 %r116, [%rd31+1]; + sub.s32 %r117, %r115, %r116; + abs.s32 %r118, %r117; + add.s32 %r119, %r118, %r114; + ld.global.u8 %r120, [%rd33+2]; + ld.global.u8 %r121, [%rd31+2]; + sub.s32 %r122, %r120, %r121; + abs.s32 %r123, %r122; + add.s32 %r124, %r119, %r123; + cvt.u16.u32 %rs21, %r124; + cvt.rn.f32.u16 %f9, %rs21; + cvt.f64.f32 %fd17, %f9; + ld.global.u8 %rs22, [%rd31+3]; + cvt.rn.f32.u16 %f10, %rs22; + cvt.f64.f32 %fd18, %f10; + div.rn.f64 %fd19, %fd18, 0d406FE00000000000; + mul.f64 %fd20, %fd19, %fd17; + cvt.rzi.u32.f64 %r125, %fd20; + cvt.u16.u32 %rs23, %r125; + ld.global.u16 %rs24, [%rd3+4]; + add.s16 %rs4, %rs24, %rs23; + st.global.u16 [%rd3+4], %rs4; + @%p8 bra $L__BB7_10; + + add.s32 %r126, %r166, 1; + add.s32 %r127, %r126, %r11; + mul.wide.u32 %rd34, %r127, 4; + add.s64 %rd35, %rd2, %rd34; + add.s32 %r128, %r12, %r126; + mul.wide.u32 %rd36, %r128, 3; + add.s64 %rd37, %rd1, %rd36; + ld.global.u8 %r129, [%rd37]; + ld.global.u8 %r130, [%rd35]; + sub.s32 %r131, %r129, %r130; + abs.s32 %r132, %r131; + ld.global.u8 %r133, [%rd37+1]; + ld.global.u8 %r134, [%rd35+1]; + sub.s32 %r135, %r133, %r134; + abs.s32 %r136, %r135; + add.s32 %r137, %r136, %r132; + ld.global.u8 %r138, [%rd37+2]; + ld.global.u8 %r139, [%rd35+2]; + sub.s32 %r140, %r138, %r139; + abs.s32 %r141, %r140; + add.s32 %r142, %r137, %r141; + cvt.u16.u32 %rs25, %r142; + cvt.rn.f32.u16 %f11, %rs25; + cvt.f64.f32 %fd21, %f11; + ld.global.u8 %rs26, [%rd35+3]; + cvt.rn.f32.u16 %f12, %rs26; + cvt.f64.f32 %fd22, %f12; + div.rn.f64 %fd23, %fd22, 0d406FE00000000000; + mul.f64 %fd24, %fd23, %fd21; + cvt.rzi.u32.f64 %r143, %fd24; + cvt.u16.u32 %rs27, %r143; + add.s16 %rs5, %rs4, %rs27; + st.global.u16 [%rd3+4], %rs5; + @%p9 bra $L__BB7_10; + + add.s32 %r144, %r166, 2; + add.s32 %r145, %r144, %r11; + mul.wide.u32 %rd38, %r145, 4; + add.s64 %rd39, %rd2, %rd38; + add.s32 %r146, %r12, %r144; + mul.wide.u32 %rd40, %r146, 3; + add.s64 %rd41, %rd1, %rd40; + ld.global.u8 %r147, [%rd41]; + ld.global.u8 %r148, [%rd39]; + sub.s32 %r149, %r147, %r148; + abs.s32 %r150, %r149; + ld.global.u8 %r151, [%rd41+1]; + ld.global.u8 %r152, [%rd39+1]; + sub.s32 %r153, %r151, %r152; + abs.s32 %r154, %r153; + add.s32 %r155, %r154, %r150; + ld.global.u8 %r156, [%rd41+2]; + ld.global.u8 %r157, [%rd39+2]; + sub.s32 %r158, %r156, %r157; + abs.s32 %r159, %r158; + add.s32 %r160, %r155, %r159; + cvt.u16.u32 %rs28, %r160; + cvt.rn.f32.u16 %f13, %rs28; + cvt.f64.f32 %fd25, %f13; + ld.global.u8 %rs29, [%rd39+3]; + cvt.rn.f32.u16 %f14, %rs29; + cvt.f64.f32 %fd26, %f14; + div.rn.f64 %fd27, %fd26, 0d406FE00000000000; + mul.f64 %fd28, %fd27, %fd25; + cvt.rzi.u32.f64 %r161, %fd28; + cvt.u16.u32 %rs30, %r161; + add.s16 %rs31, %rs5, %rs30; + st.global.u16 [%rd3+4], %rs31; + +$L__BB7_10: + add.s32 %r162, %r162, 1; + setp.lt.u32 %p10, %r162, %r27; + @%p10 bra $L__BB7_3; + +$L__BB7_11: ret; } diff --git a/vision-gpu/cuda/cuda_release_75.ptx b/vision-gpu/cuda/cuda_release_75.ptx index d8ae05b..b05ae88 100644 --- a/vision-gpu/cuda/cuda_release_75.ptx +++ b/vision-gpu/cuda/cuda_release_75.ptx @@ -442,27 +442,27 @@ $L__BB4_3: or.pred %p15, %p13, %p14; @%p15 bra $L__BB4_15; - add.s32 %r105, %r1, -3; - add.s32 %r42, %r1, 3; + add.s32 %r105, %r2, -3; + add.s32 %r42, %r2, 3; setp.gt.u32 %p16, %r105, %r42; @%p16 bra $L__BB4_15; $L__BB4_7: - add.s32 %r106, %r2, -3; - add.s32 %r48, %r2, 3; + add.s32 %r106, %r1, -3; + add.s32 %r48, %r1, 3; setp.gt.u32 %p17, %r106, %r48; @%p17 bra $L__BB4_14; $L__BB4_9: or.b32 %r53, %r106, %r105; setp.lt.s32 %p18, %r53, 0; - setp.ge.u32 %p19, %r105, %r9; + setp.ge.u32 %p19, %r106, %r9; or.pred %p20, %p19, %p18; - setp.ge.u32 %p21, %r106, %r10; + setp.ge.u32 %p21, %r105, %r10; or.pred %p22, %p21, %p20; @%p22 bra $L__BB4_13; - mad.lo.s32 %r54, %r106, %r9, %r105; + mad.lo.s32 %r54, %r105, %r9, %r106; mul.wide.u32 %rd7, %r54, 3; add.s64 %rd8, %rd3, %rd7; ld.global.u8 %rs5, [%rd8]; @@ -836,186 +836,324 @@ $L__BB6_13: .param .u32 filter_map_marker_icons_param_6 ) { - .reg .pred %p<9>; - .reg .b16 %rs<17>; - .reg .f32 %f<7>; - .reg .b32 %r<98>; - .reg .f64 %fd<13>; - .reg .b64 %rd<25>; - - - ld.param.u64 %rd6, [filter_map_marker_icons_param_0]; - ld.param.u32 %r27, [filter_map_marker_icons_param_1]; - ld.param.u64 %rd4, [filter_map_marker_icons_param_2]; - ld.param.u64 %rd5, [filter_map_marker_icons_param_3]; - ld.param.u32 %r28, [filter_map_marker_icons_param_4]; - ld.param.u32 %r29, [filter_map_marker_icons_param_5]; - ld.param.u32 %r30, [filter_map_marker_icons_param_6]; - cvta.to.global.u64 %rd1, %rd6; - mov.u32 %r31, %ntid.x; - mov.u32 %r32, %ctaid.x; - mov.u32 %r33, %tid.x; - mad.lo.s32 %r1, %r32, %r31, %r33; - mov.u32 %r34, %ntid.y; - mov.u32 %r35, %ctaid.y; - mov.u32 %r36, %tid.y; - mad.lo.s32 %r2, %r35, %r34, %r36; - setp.ge.u32 %p1, %r1, %r29; - setp.ge.u32 %p2, %r2, %r30; + .reg .pred %p<11>; + .reg .b16 %rs<33>; + .reg .f32 %f<15>; + .reg .b32 %r<167>; + .reg .f64 %fd<29>; + .reg .b64 %rd<42>; + + + ld.param.u64 %rd7, [filter_map_marker_icons_param_0]; + ld.param.u32 %r26, [filter_map_marker_icons_param_1]; + ld.param.u64 %rd5, [filter_map_marker_icons_param_2]; + ld.param.u64 %rd6, [filter_map_marker_icons_param_3]; + ld.param.u32 %r27, [filter_map_marker_icons_param_4]; + ld.param.u32 %r28, [filter_map_marker_icons_param_5]; + ld.param.u32 %r29, [filter_map_marker_icons_param_6]; + cvta.to.global.u64 %rd1, %rd7; + mov.u32 %r30, %ntid.x; + mov.u32 %r31, %ctaid.x; + mov.u32 %r32, %tid.x; + mad.lo.s32 %r1, %r31, %r30, %r32; + mov.u32 %r33, %ntid.y; + mov.u32 %r34, %ctaid.y; + mov.u32 %r35, %tid.y; + mad.lo.s32 %r2, %r34, %r33, %r35; + setp.ge.u32 %p1, %r1, %r28; + setp.ge.u32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; - @%p3 bra $L__BB7_9; - - cvta.to.global.u64 %rd7, %rd5; - mul.wide.u32 %rd8, %r1, 8; - add.s64 %rd9, %rd7, %rd8; - ld.global.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd2, %rd10; - cvta.to.global.u64 %rd11, %rd4; - mul.wide.u32 %rd12, %r2, 8; - add.s64 %rd3, %rd11, %rd12; + @%p3 bra $L__BB7_11; + + cvta.to.global.u64 %rd8, %rd6; + mul.wide.u32 %rd9, %r1, 8; + add.s64 %rd10, %rd8, %rd9; + ld.global.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd2, %rd11; + cvta.to.global.u64 %rd12, %rd5; + mul.wide.u32 %rd13, %r2, 8; + add.s64 %rd3, %rd12, %rd13; ld.global.u32 %r3, [%rd3]; - setp.eq.s32 %p4, %r28, 0; - @%p4 bra $L__BB7_9; - - and.b32 %r4, %r28, 1; - shl.b32 %r5, %r28, 1; - add.s32 %r6, %r3, %r27; - shl.b32 %r7, %r27, 1; - sub.s32 %r8, %r4, %r28; - div.u32 %r9, %r3, %r27; - mul.lo.s32 %r38, %r9, %r27; - sub.s32 %r10, %r3, %r38; - mov.u32 %r37, 0; - setp.eq.s32 %p5, %r28, 1; - setp.eq.s32 %p7, %r4, 0; - mov.u32 %r91, %r37; + setp.eq.s32 %p4, %r27, 0; + @%p4 bra $L__BB7_11; + + add.s32 %r4, %r27, -1; + add.s32 %r5, %r3, 3; + and.b32 %r6, %r27, 3; + sub.s32 %r7, %r6, %r27; + div.u32 %r8, %r3, %r26; + mul.lo.s32 %r37, %r8, %r26; + sub.s32 %r9, %r3, %r37; + mov.u32 %r36, 0; + setp.lt.u32 %p5, %r4, 3; + setp.eq.s32 %p7, %r6, 0; + setp.eq.s32 %p8, %r6, 1; + setp.eq.s32 %p9, %r6, 2; + mov.u32 %r162, %r36; $L__BB7_3: - mov.u32 %r97, %r37; + mul.lo.s32 %r11, %r162, %r27; + add.s32 %r39, %r162, %r8; + mad.lo.s32 %r12, %r39, %r26, %r9; + mov.u32 %r166, %r36; @%p5 bra $L__BB7_6; - ld.global.u16 %rs16, [%rd3+4]; - add.s32 %r95, %r28, %r91; - add.s32 %r93, %r6, %r91; - add.s32 %r92, %r3, %r91; - mov.u32 %r94, %r91; - mov.u32 %r97, %r37; + ld.global.u16 %rs32, [%rd3+4]; + mul.lo.s32 %r41, %r26, %r162; + add.s32 %r164, %r5, %r41; + add.s32 %r163, %r11, 3; + add.s32 %r15, %r3, %r41; + add.s32 %r16, %r15, 1; + add.s32 %r17, %r11, 1; + mov.u32 %r166, %r36; $L__BB7_5: - mul.wide.u32 %rd13, %r94, 4; - add.s64 %rd14, %rd2, %rd13; - mul.wide.u32 %rd15, %r92, 3; - add.s64 %rd16, %rd1, %rd15; - ld.global.u8 %r41, [%rd16]; - ld.global.u8 %r42, [%rd14]; - sub.s32 %r43, %r41, %r42; - abs.s32 %r44, %r43; - ld.global.u8 %r45, [%rd16+1]; - ld.global.u8 %r46, [%rd14+1]; - sub.s32 %r47, %r45, %r46; - abs.s32 %r48, %r47; - add.s32 %r49, %r48, %r44; - ld.global.u8 %r50, [%rd16+2]; - ld.global.u8 %r51, [%rd14+2]; - sub.s32 %r52, %r50, %r51; - abs.s32 %r53, %r52; - add.s32 %r54, %r49, %r53; - cvt.u16.u32 %rs4, %r54; - cvt.rn.f32.u16 %f1, %rs4; + add.s32 %r42, %r11, %r166; + mul.wide.u32 %rd14, %r42, 4; + add.s64 %rd15, %rd2, %rd14; + add.s32 %r43, %r15, %r166; + mul.wide.u32 %rd16, %r43, 3; + add.s64 %rd17, %rd1, %rd16; + ld.global.u8 %r44, [%rd17]; + ld.global.u8 %r45, [%rd15]; + sub.s32 %r46, %r44, %r45; + abs.s32 %r47, %r46; + ld.global.u8 %r48, [%rd17+1]; + ld.global.u8 %r49, [%rd15+1]; + sub.s32 %r50, %r48, %r49; + abs.s32 %r51, %r50; + add.s32 %r52, %r51, %r47; + ld.global.u8 %r53, [%rd17+2]; + ld.global.u8 %r54, [%rd15+2]; + sub.s32 %r55, %r53, %r54; + abs.s32 %r56, %r55; + add.s32 %r57, %r52, %r56; + cvt.u16.u32 %rs6, %r57; + cvt.rn.f32.u16 %f1, %rs6; cvt.f64.f32 %fd1, %f1; - ld.global.u8 %rs5, [%rd14+3]; - cvt.rn.f32.u16 %f2, %rs5; + ld.global.u8 %rs7, [%rd15+3]; + cvt.rn.f32.u16 %f2, %rs7; cvt.f64.f32 %fd2, %f2; div.rn.f64 %fd3, %fd2, 0d406FE00000000000; mul.f64 %fd4, %fd3, %fd1; - cvt.rzi.u32.f64 %r55, %fd4; - cvt.u16.u32 %rs6, %r55; - add.s16 %rs7, %rs16, %rs6; - st.global.u16 [%rd3+4], %rs7; - mul.wide.u32 %rd17, %r95, 4; - add.s64 %rd18, %rd2, %rd17; - mul.wide.u32 %rd19, %r93, 3; - add.s64 %rd20, %rd1, %rd19; - ld.global.u8 %r56, [%rd20]; - ld.global.u8 %r57, [%rd18]; - sub.s32 %r58, %r56, %r57; - abs.s32 %r59, %r58; - ld.global.u8 %r60, [%rd20+1]; - ld.global.u8 %r61, [%rd18+1]; - sub.s32 %r62, %r60, %r61; - abs.s32 %r63, %r62; - add.s32 %r64, %r63, %r59; - ld.global.u8 %r65, [%rd20+2]; - ld.global.u8 %r66, [%rd18+2]; + cvt.rzi.u32.f64 %r58, %fd4; + cvt.u16.u32 %rs8, %r58; + add.s16 %rs9, %rs32, %rs8; + st.global.u16 [%rd3+4], %rs9; + add.s32 %r59, %r17, %r166; + mul.wide.u32 %rd18, %r59, 4; + add.s64 %rd19, %rd2, %rd18; + add.s32 %r60, %r16, %r166; + mul.wide.u32 %rd20, %r60, 3; + add.s64 %rd21, %rd1, %rd20; + ld.global.u8 %r61, [%rd21]; + ld.global.u8 %r62, [%rd19]; + sub.s32 %r63, %r61, %r62; + abs.s32 %r64, %r63; + ld.global.u8 %r65, [%rd21+1]; + ld.global.u8 %r66, [%rd19+1]; sub.s32 %r67, %r65, %r66; abs.s32 %r68, %r67; - add.s32 %r69, %r64, %r68; - cvt.u16.u32 %rs8, %r69; - cvt.rn.f32.u16 %f3, %rs8; + add.s32 %r69, %r68, %r64; + ld.global.u8 %r70, [%rd21+2]; + ld.global.u8 %r71, [%rd19+2]; + sub.s32 %r72, %r70, %r71; + abs.s32 %r73, %r72; + add.s32 %r74, %r69, %r73; + cvt.u16.u32 %rs10, %r74; + cvt.rn.f32.u16 %f3, %rs10; cvt.f64.f32 %fd5, %f3; - ld.global.u8 %rs9, [%rd18+3]; - cvt.rn.f32.u16 %f4, %rs9; + ld.global.u8 %rs11, [%rd19+3]; + cvt.rn.f32.u16 %f4, %rs11; cvt.f64.f32 %fd6, %f4; div.rn.f64 %fd7, %fd6, 0d406FE00000000000; mul.f64 %fd8, %fd7, %fd5; - cvt.rzi.u32.f64 %r70, %fd8; - cvt.u16.u32 %rs10, %r70; - add.s16 %rs16, %rs7, %rs10; - st.global.u16 [%rd3+4], %rs16; - add.s32 %r95, %r95, %r5; - add.s32 %r94, %r94, %r5; - add.s32 %r93, %r93, %r7; - add.s32 %r92, %r92, %r7; - add.s32 %r97, %r97, 2; - add.s32 %r71, %r8, %r97; - setp.ne.s32 %p6, %r71, 0; - @%p6 bra $L__BB7_5; - -$L__BB7_6: - @%p7 bra $L__BB7_8; - - mad.lo.s32 %r72, %r97, %r28, %r91; - mul.wide.u32 %rd21, %r72, 4; - add.s64 %rd22, %rd2, %rd21; - add.s32 %r73, %r97, %r9; - add.s32 %r74, %r91, %r10; - mad.lo.s32 %r75, %r73, %r27, %r74; - mul.wide.u32 %rd23, %r75, 3; - add.s64 %rd24, %rd1, %rd23; - ld.global.u8 %r76, [%rd24]; - ld.global.u8 %r77, [%rd22]; - sub.s32 %r78, %r76, %r77; - abs.s32 %r79, %r78; - ld.global.u8 %r80, [%rd24+1]; - ld.global.u8 %r81, [%rd22+1]; - sub.s32 %r82, %r80, %r81; - abs.s32 %r83, %r82; - add.s32 %r84, %r83, %r79; - ld.global.u8 %r85, [%rd24+2]; - ld.global.u8 %r86, [%rd22+2]; - sub.s32 %r87, %r85, %r86; - abs.s32 %r88, %r87; - add.s32 %r89, %r84, %r88; - cvt.u16.u32 %rs11, %r89; - cvt.rn.f32.u16 %f5, %rs11; + cvt.rzi.u32.f64 %r75, %fd8; + cvt.u16.u32 %rs12, %r75; + add.s16 %rs13, %rs9, %rs12; + st.global.u16 [%rd3+4], %rs13; + add.s32 %r76, %r163, -1; + mul.wide.u32 %rd22, %r76, 4; + add.s64 %rd23, %rd2, %rd22; + add.s32 %r77, %r164, -1; + mul.wide.u32 %rd24, %r77, 3; + add.s64 %rd25, %rd1, %rd24; + ld.global.u8 %r78, [%rd25]; + ld.global.u8 %r79, [%rd23]; + sub.s32 %r80, %r78, %r79; + abs.s32 %r81, %r80; + ld.global.u8 %r82, [%rd25+1]; + ld.global.u8 %r83, [%rd23+1]; + sub.s32 %r84, %r82, %r83; + abs.s32 %r85, %r84; + add.s32 %r86, %r85, %r81; + ld.global.u8 %r87, [%rd25+2]; + ld.global.u8 %r88, [%rd23+2]; + sub.s32 %r89, %r87, %r88; + abs.s32 %r90, %r89; + add.s32 %r91, %r86, %r90; + cvt.u16.u32 %rs14, %r91; + cvt.rn.f32.u16 %f5, %rs14; cvt.f64.f32 %fd9, %f5; - ld.global.u8 %rs12, [%rd22+3]; - cvt.rn.f32.u16 %f6, %rs12; + ld.global.u8 %rs15, [%rd23+3]; + cvt.rn.f32.u16 %f6, %rs15; cvt.f64.f32 %fd10, %f6; div.rn.f64 %fd11, %fd10, 0d406FE00000000000; mul.f64 %fd12, %fd11, %fd9; - cvt.rzi.u32.f64 %r90, %fd12; - cvt.u16.u32 %rs13, %r90; - ld.global.u16 %rs14, [%rd3+4]; - add.s16 %rs15, %rs14, %rs13; - st.global.u16 [%rd3+4], %rs15; - -$L__BB7_8: - add.s32 %r91, %r91, 1; - setp.lt.u32 %p8, %r91, %r28; - @%p8 bra $L__BB7_3; - -$L__BB7_9: + cvt.rzi.u32.f64 %r92, %fd12; + cvt.u16.u32 %rs16, %r92; + add.s16 %rs17, %rs13, %rs16; + st.global.u16 [%rd3+4], %rs17; + mul.wide.u32 %rd26, %r163, 4; + add.s64 %rd27, %rd2, %rd26; + mul.wide.u32 %rd28, %r164, 3; + add.s64 %rd29, %rd1, %rd28; + ld.global.u8 %r93, [%rd29]; + ld.global.u8 %r94, [%rd27]; + sub.s32 %r95, %r93, %r94; + abs.s32 %r96, %r95; + ld.global.u8 %r97, [%rd29+1]; + ld.global.u8 %r98, [%rd27+1]; + sub.s32 %r99, %r97, %r98; + abs.s32 %r100, %r99; + add.s32 %r101, %r100, %r96; + ld.global.u8 %r102, [%rd29+2]; + ld.global.u8 %r103, [%rd27+2]; + sub.s32 %r104, %r102, %r103; + abs.s32 %r105, %r104; + add.s32 %r106, %r101, %r105; + cvt.u16.u32 %rs18, %r106; + cvt.rn.f32.u16 %f7, %rs18; + cvt.f64.f32 %fd13, %f7; + ld.global.u8 %rs19, [%rd27+3]; + cvt.rn.f32.u16 %f8, %rs19; + cvt.f64.f32 %fd14, %f8; + div.rn.f64 %fd15, %fd14, 0d406FE00000000000; + mul.f64 %fd16, %fd15, %fd13; + cvt.rzi.u32.f64 %r107, %fd16; + cvt.u16.u32 %rs20, %r107; + add.s16 %rs32, %rs17, %rs20; + st.global.u16 [%rd3+4], %rs32; + add.s32 %r164, %r164, 4; + add.s32 %r163, %r163, 4; + add.s32 %r166, %r166, 4; + add.s32 %r108, %r7, %r166; + setp.ne.s32 %p6, %r108, 0; + @%p6 bra $L__BB7_5; + +$L__BB7_6: + @%p7 bra $L__BB7_10; + + add.s32 %r109, %r166, %r11; + mul.wide.u32 %rd30, %r109, 4; + add.s64 %rd31, %rd2, %rd30; + add.s32 %r110, %r12, %r166; + mul.wide.u32 %rd32, %r110, 3; + add.s64 %rd33, %rd1, %rd32; + ld.global.u8 %r111, [%rd33]; + ld.global.u8 %r112, [%rd31]; + sub.s32 %r113, %r111, %r112; + abs.s32 %r114, %r113; + ld.global.u8 %r115, [%rd33+1]; + ld.global.u8 %r116, [%rd31+1]; + sub.s32 %r117, %r115, %r116; + abs.s32 %r118, %r117; + add.s32 %r119, %r118, %r114; + ld.global.u8 %r120, [%rd33+2]; + ld.global.u8 %r121, [%rd31+2]; + sub.s32 %r122, %r120, %r121; + abs.s32 %r123, %r122; + add.s32 %r124, %r119, %r123; + cvt.u16.u32 %rs21, %r124; + cvt.rn.f32.u16 %f9, %rs21; + cvt.f64.f32 %fd17, %f9; + ld.global.u8 %rs22, [%rd31+3]; + cvt.rn.f32.u16 %f10, %rs22; + cvt.f64.f32 %fd18, %f10; + div.rn.f64 %fd19, %fd18, 0d406FE00000000000; + mul.f64 %fd20, %fd19, %fd17; + cvt.rzi.u32.f64 %r125, %fd20; + cvt.u16.u32 %rs23, %r125; + ld.global.u16 %rs24, [%rd3+4]; + add.s16 %rs4, %rs24, %rs23; + st.global.u16 [%rd3+4], %rs4; + @%p8 bra $L__BB7_10; + + add.s32 %r126, %r166, 1; + add.s32 %r127, %r126, %r11; + mul.wide.u32 %rd34, %r127, 4; + add.s64 %rd35, %rd2, %rd34; + add.s32 %r128, %r12, %r126; + mul.wide.u32 %rd36, %r128, 3; + add.s64 %rd37, %rd1, %rd36; + ld.global.u8 %r129, [%rd37]; + ld.global.u8 %r130, [%rd35]; + sub.s32 %r131, %r129, %r130; + abs.s32 %r132, %r131; + ld.global.u8 %r133, [%rd37+1]; + ld.global.u8 %r134, [%rd35+1]; + sub.s32 %r135, %r133, %r134; + abs.s32 %r136, %r135; + add.s32 %r137, %r136, %r132; + ld.global.u8 %r138, [%rd37+2]; + ld.global.u8 %r139, [%rd35+2]; + sub.s32 %r140, %r138, %r139; + abs.s32 %r141, %r140; + add.s32 %r142, %r137, %r141; + cvt.u16.u32 %rs25, %r142; + cvt.rn.f32.u16 %f11, %rs25; + cvt.f64.f32 %fd21, %f11; + ld.global.u8 %rs26, [%rd35+3]; + cvt.rn.f32.u16 %f12, %rs26; + cvt.f64.f32 %fd22, %f12; + div.rn.f64 %fd23, %fd22, 0d406FE00000000000; + mul.f64 %fd24, %fd23, %fd21; + cvt.rzi.u32.f64 %r143, %fd24; + cvt.u16.u32 %rs27, %r143; + add.s16 %rs5, %rs4, %rs27; + st.global.u16 [%rd3+4], %rs5; + @%p9 bra $L__BB7_10; + + add.s32 %r144, %r166, 2; + add.s32 %r145, %r144, %r11; + mul.wide.u32 %rd38, %r145, 4; + add.s64 %rd39, %rd2, %rd38; + add.s32 %r146, %r12, %r144; + mul.wide.u32 %rd40, %r146, 3; + add.s64 %rd41, %rd1, %rd40; + ld.global.u8 %r147, [%rd41]; + ld.global.u8 %r148, [%rd39]; + sub.s32 %r149, %r147, %r148; + abs.s32 %r150, %r149; + ld.global.u8 %r151, [%rd41+1]; + ld.global.u8 %r152, [%rd39+1]; + sub.s32 %r153, %r151, %r152; + abs.s32 %r154, %r153; + add.s32 %r155, %r154, %r150; + ld.global.u8 %r156, [%rd41+2]; + ld.global.u8 %r157, [%rd39+2]; + sub.s32 %r158, %r156, %r157; + abs.s32 %r159, %r158; + add.s32 %r160, %r155, %r159; + cvt.u16.u32 %rs28, %r160; + cvt.rn.f32.u16 %f13, %rs28; + cvt.f64.f32 %fd25, %f13; + ld.global.u8 %rs29, [%rd39+3]; + cvt.rn.f32.u16 %f14, %rs29; + cvt.f64.f32 %fd26, %f14; + div.rn.f64 %fd27, %fd26, 0d406FE00000000000; + mul.f64 %fd28, %fd27, %fd25; + cvt.rzi.u32.f64 %r161, %fd28; + cvt.u16.u32 %rs30, %r161; + add.s16 %rs31, %rs5, %rs30; + st.global.u16 [%rd3+4], %rs31; + +$L__BB7_10: + add.s32 %r162, %r162, 1; + setp.lt.u32 %p10, %r162, %r27; + @%p10 bra $L__BB7_3; + +$L__BB7_11: ret; } diff --git a/vision-gpu/cuda/cuda_release_86.ptx b/vision-gpu/cuda/cuda_release_86.ptx index 29c48c1..4bc9481 100644 --- a/vision-gpu/cuda/cuda_release_86.ptx +++ b/vision-gpu/cuda/cuda_release_86.ptx @@ -442,27 +442,27 @@ $L__BB4_3: or.pred %p15, %p13, %p14; @%p15 bra $L__BB4_15; - add.s32 %r105, %r1, -3; - add.s32 %r42, %r1, 3; + add.s32 %r105, %r2, -3; + add.s32 %r42, %r2, 3; setp.gt.u32 %p16, %r105, %r42; @%p16 bra $L__BB4_15; $L__BB4_7: - add.s32 %r106, %r2, -3; - add.s32 %r48, %r2, 3; + add.s32 %r106, %r1, -3; + add.s32 %r48, %r1, 3; setp.gt.u32 %p17, %r106, %r48; @%p17 bra $L__BB4_14; $L__BB4_9: or.b32 %r53, %r106, %r105; setp.lt.s32 %p18, %r53, 0; - setp.ge.u32 %p19, %r105, %r9; + setp.ge.u32 %p19, %r106, %r9; or.pred %p20, %p19, %p18; - setp.ge.u32 %p21, %r106, %r10; + setp.ge.u32 %p21, %r105, %r10; or.pred %p22, %p21, %p20; @%p22 bra $L__BB4_13; - mad.lo.s32 %r54, %r106, %r9, %r105; + mad.lo.s32 %r54, %r105, %r9, %r106; mul.wide.u32 %rd7, %r54, 3; add.s64 %rd8, %rd3, %rd7; ld.global.u8 %rs5, [%rd8]; @@ -836,186 +836,324 @@ $L__BB6_13: .param .u32 filter_map_marker_icons_param_6 ) { - .reg .pred %p<9>; - .reg .b16 %rs<17>; - .reg .f32 %f<7>; - .reg .b32 %r<98>; - .reg .f64 %fd<13>; - .reg .b64 %rd<25>; - - - ld.param.u64 %rd6, [filter_map_marker_icons_param_0]; - ld.param.u32 %r27, [filter_map_marker_icons_param_1]; - ld.param.u64 %rd4, [filter_map_marker_icons_param_2]; - ld.param.u64 %rd5, [filter_map_marker_icons_param_3]; - ld.param.u32 %r28, [filter_map_marker_icons_param_4]; - ld.param.u32 %r29, [filter_map_marker_icons_param_5]; - ld.param.u32 %r30, [filter_map_marker_icons_param_6]; - cvta.to.global.u64 %rd1, %rd6; - mov.u32 %r31, %ntid.x; - mov.u32 %r32, %ctaid.x; - mov.u32 %r33, %tid.x; - mad.lo.s32 %r1, %r32, %r31, %r33; - mov.u32 %r34, %ntid.y; - mov.u32 %r35, %ctaid.y; - mov.u32 %r36, %tid.y; - mad.lo.s32 %r2, %r35, %r34, %r36; - setp.ge.u32 %p1, %r1, %r29; - setp.ge.u32 %p2, %r2, %r30; + .reg .pred %p<11>; + .reg .b16 %rs<33>; + .reg .f32 %f<15>; + .reg .b32 %r<167>; + .reg .f64 %fd<29>; + .reg .b64 %rd<42>; + + + ld.param.u64 %rd7, [filter_map_marker_icons_param_0]; + ld.param.u32 %r26, [filter_map_marker_icons_param_1]; + ld.param.u64 %rd5, [filter_map_marker_icons_param_2]; + ld.param.u64 %rd6, [filter_map_marker_icons_param_3]; + ld.param.u32 %r27, [filter_map_marker_icons_param_4]; + ld.param.u32 %r28, [filter_map_marker_icons_param_5]; + ld.param.u32 %r29, [filter_map_marker_icons_param_6]; + cvta.to.global.u64 %rd1, %rd7; + mov.u32 %r30, %ntid.x; + mov.u32 %r31, %ctaid.x; + mov.u32 %r32, %tid.x; + mad.lo.s32 %r1, %r31, %r30, %r32; + mov.u32 %r33, %ntid.y; + mov.u32 %r34, %ctaid.y; + mov.u32 %r35, %tid.y; + mad.lo.s32 %r2, %r34, %r33, %r35; + setp.ge.u32 %p1, %r1, %r28; + setp.ge.u32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; - @%p3 bra $L__BB7_9; - - cvta.to.global.u64 %rd7, %rd5; - mul.wide.u32 %rd8, %r1, 8; - add.s64 %rd9, %rd7, %rd8; - ld.global.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd2, %rd10; - cvta.to.global.u64 %rd11, %rd4; - mul.wide.u32 %rd12, %r2, 8; - add.s64 %rd3, %rd11, %rd12; + @%p3 bra $L__BB7_11; + + cvta.to.global.u64 %rd8, %rd6; + mul.wide.u32 %rd9, %r1, 8; + add.s64 %rd10, %rd8, %rd9; + ld.global.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd2, %rd11; + cvta.to.global.u64 %rd12, %rd5; + mul.wide.u32 %rd13, %r2, 8; + add.s64 %rd3, %rd12, %rd13; ld.global.u32 %r3, [%rd3]; - setp.eq.s32 %p4, %r28, 0; - @%p4 bra $L__BB7_9; - - and.b32 %r4, %r28, 1; - shl.b32 %r5, %r28, 1; - add.s32 %r6, %r3, %r27; - shl.b32 %r7, %r27, 1; - sub.s32 %r8, %r4, %r28; - div.u32 %r9, %r3, %r27; - mul.lo.s32 %r38, %r9, %r27; - sub.s32 %r10, %r3, %r38; - mov.u32 %r37, 0; - setp.eq.s32 %p5, %r28, 1; - setp.eq.s32 %p7, %r4, 0; - mov.u32 %r91, %r37; + setp.eq.s32 %p4, %r27, 0; + @%p4 bra $L__BB7_11; + + add.s32 %r4, %r27, -1; + add.s32 %r5, %r3, 3; + and.b32 %r6, %r27, 3; + sub.s32 %r7, %r6, %r27; + div.u32 %r8, %r3, %r26; + mul.lo.s32 %r37, %r8, %r26; + sub.s32 %r9, %r3, %r37; + mov.u32 %r36, 0; + setp.lt.u32 %p5, %r4, 3; + setp.eq.s32 %p7, %r6, 0; + setp.eq.s32 %p8, %r6, 1; + setp.eq.s32 %p9, %r6, 2; + mov.u32 %r162, %r36; $L__BB7_3: - mov.u32 %r97, %r37; + mul.lo.s32 %r11, %r162, %r27; + add.s32 %r39, %r162, %r8; + mad.lo.s32 %r12, %r39, %r26, %r9; + mov.u32 %r166, %r36; @%p5 bra $L__BB7_6; - ld.global.u16 %rs16, [%rd3+4]; - add.s32 %r95, %r28, %r91; - add.s32 %r93, %r6, %r91; - add.s32 %r92, %r3, %r91; - mov.u32 %r94, %r91; - mov.u32 %r97, %r37; + ld.global.u16 %rs32, [%rd3+4]; + mul.lo.s32 %r41, %r26, %r162; + add.s32 %r164, %r5, %r41; + add.s32 %r163, %r11, 3; + add.s32 %r15, %r3, %r41; + add.s32 %r16, %r15, 1; + add.s32 %r17, %r11, 1; + mov.u32 %r166, %r36; $L__BB7_5: - mul.wide.u32 %rd13, %r94, 4; - add.s64 %rd14, %rd2, %rd13; - mul.wide.u32 %rd15, %r92, 3; - add.s64 %rd16, %rd1, %rd15; - ld.global.u8 %r41, [%rd16]; - ld.global.u8 %r42, [%rd14]; - sub.s32 %r43, %r41, %r42; - abs.s32 %r44, %r43; - ld.global.u8 %r45, [%rd16+1]; - ld.global.u8 %r46, [%rd14+1]; - sub.s32 %r47, %r45, %r46; - abs.s32 %r48, %r47; - add.s32 %r49, %r48, %r44; - ld.global.u8 %r50, [%rd16+2]; - ld.global.u8 %r51, [%rd14+2]; - sub.s32 %r52, %r50, %r51; - abs.s32 %r53, %r52; - add.s32 %r54, %r49, %r53; - cvt.u16.u32 %rs4, %r54; - cvt.rn.f32.u16 %f1, %rs4; + add.s32 %r42, %r11, %r166; + mul.wide.u32 %rd14, %r42, 4; + add.s64 %rd15, %rd2, %rd14; + add.s32 %r43, %r15, %r166; + mul.wide.u32 %rd16, %r43, 3; + add.s64 %rd17, %rd1, %rd16; + ld.global.u8 %r44, [%rd17]; + ld.global.u8 %r45, [%rd15]; + sub.s32 %r46, %r44, %r45; + abs.s32 %r47, %r46; + ld.global.u8 %r48, [%rd17+1]; + ld.global.u8 %r49, [%rd15+1]; + sub.s32 %r50, %r48, %r49; + abs.s32 %r51, %r50; + add.s32 %r52, %r51, %r47; + ld.global.u8 %r53, [%rd17+2]; + ld.global.u8 %r54, [%rd15+2]; + sub.s32 %r55, %r53, %r54; + abs.s32 %r56, %r55; + add.s32 %r57, %r52, %r56; + cvt.u16.u32 %rs6, %r57; + cvt.rn.f32.u16 %f1, %rs6; cvt.f64.f32 %fd1, %f1; - ld.global.u8 %rs5, [%rd14+3]; - cvt.rn.f32.u16 %f2, %rs5; + ld.global.u8 %rs7, [%rd15+3]; + cvt.rn.f32.u16 %f2, %rs7; cvt.f64.f32 %fd2, %f2; div.rn.f64 %fd3, %fd2, 0d406FE00000000000; mul.f64 %fd4, %fd3, %fd1; - cvt.rzi.u32.f64 %r55, %fd4; - cvt.u16.u32 %rs6, %r55; - add.s16 %rs7, %rs16, %rs6; - st.global.u16 [%rd3+4], %rs7; - mul.wide.u32 %rd17, %r95, 4; - add.s64 %rd18, %rd2, %rd17; - mul.wide.u32 %rd19, %r93, 3; - add.s64 %rd20, %rd1, %rd19; - ld.global.u8 %r56, [%rd20]; - ld.global.u8 %r57, [%rd18]; - sub.s32 %r58, %r56, %r57; - abs.s32 %r59, %r58; - ld.global.u8 %r60, [%rd20+1]; - ld.global.u8 %r61, [%rd18+1]; - sub.s32 %r62, %r60, %r61; - abs.s32 %r63, %r62; - add.s32 %r64, %r63, %r59; - ld.global.u8 %r65, [%rd20+2]; - ld.global.u8 %r66, [%rd18+2]; + cvt.rzi.u32.f64 %r58, %fd4; + cvt.u16.u32 %rs8, %r58; + add.s16 %rs9, %rs32, %rs8; + st.global.u16 [%rd3+4], %rs9; + add.s32 %r59, %r17, %r166; + mul.wide.u32 %rd18, %r59, 4; + add.s64 %rd19, %rd2, %rd18; + add.s32 %r60, %r16, %r166; + mul.wide.u32 %rd20, %r60, 3; + add.s64 %rd21, %rd1, %rd20; + ld.global.u8 %r61, [%rd21]; + ld.global.u8 %r62, [%rd19]; + sub.s32 %r63, %r61, %r62; + abs.s32 %r64, %r63; + ld.global.u8 %r65, [%rd21+1]; + ld.global.u8 %r66, [%rd19+1]; sub.s32 %r67, %r65, %r66; abs.s32 %r68, %r67; - add.s32 %r69, %r64, %r68; - cvt.u16.u32 %rs8, %r69; - cvt.rn.f32.u16 %f3, %rs8; + add.s32 %r69, %r68, %r64; + ld.global.u8 %r70, [%rd21+2]; + ld.global.u8 %r71, [%rd19+2]; + sub.s32 %r72, %r70, %r71; + abs.s32 %r73, %r72; + add.s32 %r74, %r69, %r73; + cvt.u16.u32 %rs10, %r74; + cvt.rn.f32.u16 %f3, %rs10; cvt.f64.f32 %fd5, %f3; - ld.global.u8 %rs9, [%rd18+3]; - cvt.rn.f32.u16 %f4, %rs9; + ld.global.u8 %rs11, [%rd19+3]; + cvt.rn.f32.u16 %f4, %rs11; cvt.f64.f32 %fd6, %f4; div.rn.f64 %fd7, %fd6, 0d406FE00000000000; mul.f64 %fd8, %fd7, %fd5; - cvt.rzi.u32.f64 %r70, %fd8; - cvt.u16.u32 %rs10, %r70; - add.s16 %rs16, %rs7, %rs10; - st.global.u16 [%rd3+4], %rs16; - add.s32 %r95, %r95, %r5; - add.s32 %r94, %r94, %r5; - add.s32 %r93, %r93, %r7; - add.s32 %r92, %r92, %r7; - add.s32 %r97, %r97, 2; - add.s32 %r71, %r8, %r97; - setp.ne.s32 %p6, %r71, 0; - @%p6 bra $L__BB7_5; - -$L__BB7_6: - @%p7 bra $L__BB7_8; - - mad.lo.s32 %r72, %r97, %r28, %r91; - mul.wide.u32 %rd21, %r72, 4; - add.s64 %rd22, %rd2, %rd21; - add.s32 %r73, %r97, %r9; - add.s32 %r74, %r91, %r10; - mad.lo.s32 %r75, %r73, %r27, %r74; - mul.wide.u32 %rd23, %r75, 3; - add.s64 %rd24, %rd1, %rd23; - ld.global.u8 %r76, [%rd24]; - ld.global.u8 %r77, [%rd22]; - sub.s32 %r78, %r76, %r77; - abs.s32 %r79, %r78; - ld.global.u8 %r80, [%rd24+1]; - ld.global.u8 %r81, [%rd22+1]; - sub.s32 %r82, %r80, %r81; - abs.s32 %r83, %r82; - add.s32 %r84, %r83, %r79; - ld.global.u8 %r85, [%rd24+2]; - ld.global.u8 %r86, [%rd22+2]; - sub.s32 %r87, %r85, %r86; - abs.s32 %r88, %r87; - add.s32 %r89, %r84, %r88; - cvt.u16.u32 %rs11, %r89; - cvt.rn.f32.u16 %f5, %rs11; + cvt.rzi.u32.f64 %r75, %fd8; + cvt.u16.u32 %rs12, %r75; + add.s16 %rs13, %rs9, %rs12; + st.global.u16 [%rd3+4], %rs13; + add.s32 %r76, %r163, -1; + mul.wide.u32 %rd22, %r76, 4; + add.s64 %rd23, %rd2, %rd22; + add.s32 %r77, %r164, -1; + mul.wide.u32 %rd24, %r77, 3; + add.s64 %rd25, %rd1, %rd24; + ld.global.u8 %r78, [%rd25]; + ld.global.u8 %r79, [%rd23]; + sub.s32 %r80, %r78, %r79; + abs.s32 %r81, %r80; + ld.global.u8 %r82, [%rd25+1]; + ld.global.u8 %r83, [%rd23+1]; + sub.s32 %r84, %r82, %r83; + abs.s32 %r85, %r84; + add.s32 %r86, %r85, %r81; + ld.global.u8 %r87, [%rd25+2]; + ld.global.u8 %r88, [%rd23+2]; + sub.s32 %r89, %r87, %r88; + abs.s32 %r90, %r89; + add.s32 %r91, %r86, %r90; + cvt.u16.u32 %rs14, %r91; + cvt.rn.f32.u16 %f5, %rs14; cvt.f64.f32 %fd9, %f5; - ld.global.u8 %rs12, [%rd22+3]; - cvt.rn.f32.u16 %f6, %rs12; + ld.global.u8 %rs15, [%rd23+3]; + cvt.rn.f32.u16 %f6, %rs15; cvt.f64.f32 %fd10, %f6; div.rn.f64 %fd11, %fd10, 0d406FE00000000000; mul.f64 %fd12, %fd11, %fd9; - cvt.rzi.u32.f64 %r90, %fd12; - cvt.u16.u32 %rs13, %r90; - ld.global.u16 %rs14, [%rd3+4]; - add.s16 %rs15, %rs14, %rs13; - st.global.u16 [%rd3+4], %rs15; - -$L__BB7_8: - add.s32 %r91, %r91, 1; - setp.lt.u32 %p8, %r91, %r28; - @%p8 bra $L__BB7_3; - -$L__BB7_9: + cvt.rzi.u32.f64 %r92, %fd12; + cvt.u16.u32 %rs16, %r92; + add.s16 %rs17, %rs13, %rs16; + st.global.u16 [%rd3+4], %rs17; + mul.wide.u32 %rd26, %r163, 4; + add.s64 %rd27, %rd2, %rd26; + mul.wide.u32 %rd28, %r164, 3; + add.s64 %rd29, %rd1, %rd28; + ld.global.u8 %r93, [%rd29]; + ld.global.u8 %r94, [%rd27]; + sub.s32 %r95, %r93, %r94; + abs.s32 %r96, %r95; + ld.global.u8 %r97, [%rd29+1]; + ld.global.u8 %r98, [%rd27+1]; + sub.s32 %r99, %r97, %r98; + abs.s32 %r100, %r99; + add.s32 %r101, %r100, %r96; + ld.global.u8 %r102, [%rd29+2]; + ld.global.u8 %r103, [%rd27+2]; + sub.s32 %r104, %r102, %r103; + abs.s32 %r105, %r104; + add.s32 %r106, %r101, %r105; + cvt.u16.u32 %rs18, %r106; + cvt.rn.f32.u16 %f7, %rs18; + cvt.f64.f32 %fd13, %f7; + ld.global.u8 %rs19, [%rd27+3]; + cvt.rn.f32.u16 %f8, %rs19; + cvt.f64.f32 %fd14, %f8; + div.rn.f64 %fd15, %fd14, 0d406FE00000000000; + mul.f64 %fd16, %fd15, %fd13; + cvt.rzi.u32.f64 %r107, %fd16; + cvt.u16.u32 %rs20, %r107; + add.s16 %rs32, %rs17, %rs20; + st.global.u16 [%rd3+4], %rs32; + add.s32 %r164, %r164, 4; + add.s32 %r163, %r163, 4; + add.s32 %r166, %r166, 4; + add.s32 %r108, %r7, %r166; + setp.ne.s32 %p6, %r108, 0; + @%p6 bra $L__BB7_5; + +$L__BB7_6: + @%p7 bra $L__BB7_10; + + add.s32 %r109, %r166, %r11; + mul.wide.u32 %rd30, %r109, 4; + add.s64 %rd31, %rd2, %rd30; + add.s32 %r110, %r12, %r166; + mul.wide.u32 %rd32, %r110, 3; + add.s64 %rd33, %rd1, %rd32; + ld.global.u8 %r111, [%rd33]; + ld.global.u8 %r112, [%rd31]; + sub.s32 %r113, %r111, %r112; + abs.s32 %r114, %r113; + ld.global.u8 %r115, [%rd33+1]; + ld.global.u8 %r116, [%rd31+1]; + sub.s32 %r117, %r115, %r116; + abs.s32 %r118, %r117; + add.s32 %r119, %r118, %r114; + ld.global.u8 %r120, [%rd33+2]; + ld.global.u8 %r121, [%rd31+2]; + sub.s32 %r122, %r120, %r121; + abs.s32 %r123, %r122; + add.s32 %r124, %r119, %r123; + cvt.u16.u32 %rs21, %r124; + cvt.rn.f32.u16 %f9, %rs21; + cvt.f64.f32 %fd17, %f9; + ld.global.u8 %rs22, [%rd31+3]; + cvt.rn.f32.u16 %f10, %rs22; + cvt.f64.f32 %fd18, %f10; + div.rn.f64 %fd19, %fd18, 0d406FE00000000000; + mul.f64 %fd20, %fd19, %fd17; + cvt.rzi.u32.f64 %r125, %fd20; + cvt.u16.u32 %rs23, %r125; + ld.global.u16 %rs24, [%rd3+4]; + add.s16 %rs4, %rs24, %rs23; + st.global.u16 [%rd3+4], %rs4; + @%p8 bra $L__BB7_10; + + add.s32 %r126, %r166, 1; + add.s32 %r127, %r126, %r11; + mul.wide.u32 %rd34, %r127, 4; + add.s64 %rd35, %rd2, %rd34; + add.s32 %r128, %r12, %r126; + mul.wide.u32 %rd36, %r128, 3; + add.s64 %rd37, %rd1, %rd36; + ld.global.u8 %r129, [%rd37]; + ld.global.u8 %r130, [%rd35]; + sub.s32 %r131, %r129, %r130; + abs.s32 %r132, %r131; + ld.global.u8 %r133, [%rd37+1]; + ld.global.u8 %r134, [%rd35+1]; + sub.s32 %r135, %r133, %r134; + abs.s32 %r136, %r135; + add.s32 %r137, %r136, %r132; + ld.global.u8 %r138, [%rd37+2]; + ld.global.u8 %r139, [%rd35+2]; + sub.s32 %r140, %r138, %r139; + abs.s32 %r141, %r140; + add.s32 %r142, %r137, %r141; + cvt.u16.u32 %rs25, %r142; + cvt.rn.f32.u16 %f11, %rs25; + cvt.f64.f32 %fd21, %f11; + ld.global.u8 %rs26, [%rd35+3]; + cvt.rn.f32.u16 %f12, %rs26; + cvt.f64.f32 %fd22, %f12; + div.rn.f64 %fd23, %fd22, 0d406FE00000000000; + mul.f64 %fd24, %fd23, %fd21; + cvt.rzi.u32.f64 %r143, %fd24; + cvt.u16.u32 %rs27, %r143; + add.s16 %rs5, %rs4, %rs27; + st.global.u16 [%rd3+4], %rs5; + @%p9 bra $L__BB7_10; + + add.s32 %r144, %r166, 2; + add.s32 %r145, %r144, %r11; + mul.wide.u32 %rd38, %r145, 4; + add.s64 %rd39, %rd2, %rd38; + add.s32 %r146, %r12, %r144; + mul.wide.u32 %rd40, %r146, 3; + add.s64 %rd41, %rd1, %rd40; + ld.global.u8 %r147, [%rd41]; + ld.global.u8 %r148, [%rd39]; + sub.s32 %r149, %r147, %r148; + abs.s32 %r150, %r149; + ld.global.u8 %r151, [%rd41+1]; + ld.global.u8 %r152, [%rd39+1]; + sub.s32 %r153, %r151, %r152; + abs.s32 %r154, %r153; + add.s32 %r155, %r154, %r150; + ld.global.u8 %r156, [%rd41+2]; + ld.global.u8 %r157, [%rd39+2]; + sub.s32 %r158, %r156, %r157; + abs.s32 %r159, %r158; + add.s32 %r160, %r155, %r159; + cvt.u16.u32 %rs28, %r160; + cvt.rn.f32.u16 %f13, %rs28; + cvt.f64.f32 %fd25, %f13; + ld.global.u8 %rs29, [%rd39+3]; + cvt.rn.f32.u16 %f14, %rs29; + cvt.f64.f32 %fd26, %f14; + div.rn.f64 %fd27, %fd26, 0d406FE00000000000; + mul.f64 %fd28, %fd27, %fd25; + cvt.rzi.u32.f64 %r161, %fd28; + cvt.u16.u32 %rs30, %r161; + add.s16 %rs31, %rs5, %rs30; + st.global.u16 [%rd3+4], %rs31; + +$L__BB7_10: + add.s32 %r162, %r162, 1; + setp.lt.u32 %p10, %r162, %r27; + @%p10 bra $L__BB7_3; + +$L__BB7_11: ret; }