gpgpu.bib

@Comment{Journal-issue record (SIGARCH Comput. Archit. News); same authors, pages and acmid as the proceedings entry Hong:2009:AMG:1555754.1555775.}
@article{Hong:2009:AMG:1555815.1555775,
 author = {Hong, Sunpyo and Kim, Hyesoon},
 title = {An Analytical Model for a {GPU} Architecture with Memory-level and Thread-level Parallelism Awareness},
 journal = {SIGARCH Comput. Archit. News},
 issue_date = {June 2009},
 volume = {37},
 number = {3},
 month = jun,
 year = {2009},
 issn = {0163-5964},
 pages = {152--163},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/1555815.1555775},
 doi = {10.1145/1555815.1555775},
 acmid = {1555775},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {GPU architecture, analytical model, cuda, memory level parallelism, performance estimation, warp level parallelism},
}

@Comment{Proceedings record (ISCA '09); same authors, pages and acmid as the journal-issue entry Hong:2009:AMG:1555815.1555775.}
@inproceedings{Hong:2009:AMG:1555754.1555775,
 author = {Hong, Sunpyo and Kim, Hyesoon},
 title = {An Analytical Model for a {GPU} Architecture with Memory-level and Thread-level Parallelism Awareness},
 booktitle = {Proceedings of the 36th Annual International Symposium on Computer Architecture},
 series = {ISCA '09},
 year = {2009},
 isbn = {978-1-60558-526-0},
 location = {Austin, TX, USA},
 pages = {152--163},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/1555754.1555775},
 doi = {10.1145/1555754.1555775},
 acmid = {1555775},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {GPU architecture, analytical model, cuda, memory level parallelism, performance estimation, warp level parallelism},
}

@inproceedings{Stuart:2010:GC:2031978.2032028,
 author = {Stuart, Jeff A. and Cox, Michael and Owens, John D.},
 title = {{GPU}-to-{CPU} Callbacks},
 booktitle = {Proceedings of the 2010 Conference on Parallel Processing},
 series = {Euro-Par 2010},
 year = {2011},
 isbn = {978-3-642-21877-4},
 location = {Ischia, Italy},
 pages = {365--372},
 numpages = {8},
 url = {http://dl.acm.org/citation.cfm?id=2031978.2032028},
 acmid = {2032028},
 publisher = {Springer-Verlag},
 address = {Berlin, Heidelberg},
}

@inproceedings{Fatahalian:2008:GCL:1401132.1401147,
 author = {Fatahalian, Kayvon and Houston, Mike},
 title = {{GPUs}: A Closer Look},
 booktitle = {ACM SIGGRAPH 2008 Classes},
 series = {SIGGRAPH '08},
 year = {2008},
 location = {Los Angeles, California},
 pages = {11:1--11:11},
 articleno = {11},
 numpages = {11},
 url = {http://doi.acm.org/10.1145/1401132.1401147},
 doi = {10.1145/1401132.1401147},
 acmid = {1401147},
 publisher = {ACM},
 address = {New York, NY, USA},
}

@Comment{Proceedings record (SIGGRAPH '04); same authors, pages and acmid as the ACM Trans. Graph. entry Buck:2004:BGS:1015706.1015800.}
@inproceedings{Buck:2004:BGS:1186562.1015800,
 author = {Buck, Ian and Foley, Tim and Horn, Daniel and Sugerman, Jeremy and Fatahalian, Kayvon and Houston, Mike and Hanrahan, Pat},
 title = {Brook for {GPUs}: Stream Computing on Graphics Hardware},
 booktitle = {ACM SIGGRAPH 2004 Papers},
 series = {SIGGRAPH '04},
 year = {2004},
 location = {Los Angeles, California},
 pages = {777--786},
 numpages = {10},
 url = {http://doi.acm.org/10.1145/1186562.1015800},
 doi = {10.1145/1186562.1015800},
 acmid = {1015800},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {Data Parallel Computing, GPU Computing, Brook, Programmable Graphics Hardware, Stream Computing},
}

@Comment{Journal record (ACM Trans. Graph.); same authors, pages and acmid as the proceedings entry Buck:2004:BGS:1186562.1015800.}
@article{Buck:2004:BGS:1015706.1015800,
 author = {Buck, Ian and Foley, Tim and Horn, Daniel and Sugerman, Jeremy and Fatahalian, Kayvon and Houston, Mike and Hanrahan, Pat},
 title = {Brook for {GPUs}: Stream Computing on Graphics Hardware},
 journal = {ACM Trans. Graph.},
 issue_date = {August 2004},
 volume = {23},
 number = {3},
 month = aug,
 year = {2004},
 issn = {0730-0301},
 pages = {777--786},
 numpages = {10},
 url = {http://doi.acm.org/10.1145/1015706.1015800},
 doi = {10.1145/1015706.1015800},
 acmid = {1015800},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {Data Parallel Computing, GPU Computing, Brook, Programmable Graphics Hardware, Stream Computing},
}

@Comment{Not an actual paper; see http://lorenabarba.com/gpuatbu/Talks__Luebke.html}
@inproceedings{Luebke:2009:GHG:1555880.1555888,
 author = {Luebke, David},
 title = {Graphics Hardware \& {GPU} Computing: Past, Present, and Future},
 booktitle = {Proceedings of Graphics Interface 2009},
 series = {GI '09},
 year = {2009},
 isbn = {978-1-56881-470-4},
 location = {Kelowna, British Columbia, Canada},
 pages = {6:1--6:1},
 articleno = {6},
 numpages = {1},
 url = {http://dl.acm.org/citation.cfm?id=1555880.1555888},
 acmid = {1555888},
 publisher = {Canadian Information Processing Society},
 address = {Toronto, Ont., Canada},
}

@Comment{Keywords de-duplicated; the exported HTML entity for a plus sign is restored as CPU+GPU.}
@article{Nickolls:2010:GCE:1803935.1804055,
 author = {Nickolls, John and Dally, William J.},
 title = {The {GPU} Computing Era},
 journal = {IEEE Micro},
 issue_date = {March 2010},
 volume = {30},
 number = {2},
 month = mar,
 year = {2010},
 issn = {0272-1732},
 pages = {56--69},
 numpages = {14},
 url = {http://dx.doi.org/10.1109/MM.2010.41},
 doi = {10.1109/MM.2010.41},
 acmid = {1804055},
 publisher = {IEEE Computer Society Press},
 address = {Los Alamitos, CA, USA},
 keywords = {CUDA, Fermi GPU architecture, GPU computing, NVIDIA, Tesla GPU architecture, heterogeneous CPU+GPU coprocessing, scalable parallel computing},
}

@Comment{Journal-issue record (SIGARCH Comput. Archit. News); same authors, pages and acmid as the proceedings entry Lee:2010:DGV:1815961.1816021.}
@article{Lee:2010:DGV:1816038.1816021,
 author = {Lee, Victor W. and Kim, Changkyu and Chhugani, Jatin and Deisher, Michael and Kim, Daehyun and Nguyen, Anthony D. and Satish, Nadathur and Smelyanskiy, Mikhail and Chennupaty, Srinivas and Hammarlund, Per and Singhal, Ronak and Dubey, Pradeep},
 title = {Debunking the {100X} {GPU} vs. {CPU} Myth: An Evaluation of Throughput Computing on {CPU} and {GPU}},
 journal = {SIGARCH Comput. Archit. News},
 issue_date = {June 2010},
 volume = {38},
 number = {3},
 month = jun,
 year = {2010},
 issn = {0163-5964},
 pages = {451--460},
 numpages = {10},
 url = {http://doi.acm.org/10.1145/1816038.1816021},
 doi = {10.1145/1816038.1816021},
 acmid = {1816021},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {cpu architecture, gpu architecture, performance analysis, performance measurement, software optimization, throughput computing},
}

@Comment{Proceedings record (ISCA '10); same authors, pages and acmid as the journal-issue entry Lee:2010:DGV:1816038.1816021.}
@inproceedings{Lee:2010:DGV:1815961.1816021,
 author = {Lee, Victor W. and Kim, Changkyu and Chhugani, Jatin and Deisher, Michael and Kim, Daehyun and Nguyen, Anthony D. and Satish, Nadathur and Smelyanskiy, Mikhail and Chennupaty, Srinivas and Hammarlund, Per and Singhal, Ronak and Dubey, Pradeep},
 title = {Debunking the {100X} {GPU} vs. {CPU} Myth: An Evaluation of Throughput Computing on {CPU} and {GPU}},
 booktitle = {Proceedings of the 37th Annual International Symposium on Computer Architecture},
 series = {ISCA '10},
 year = {2010},
 isbn = {978-1-4503-0053-7},
 location = {Saint-Malo, France},
 pages = {451--460},
 numpages = {10},
 url = {http://doi.acm.org/10.1145/1815961.1816021},
 doi = {10.1145/1815961.1816021},
 acmid = {1816021},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {cpu architecture, gpu architecture, performance analysis, performance measurement, software optimization, throughput computing},
}

@Comment{No journal is known for this survey, so it is a misc entry with a descriptive note rather than an article.}
@misc{mcclanahan2010history,
  title={History and Evolution of {GPU} Architecture},
  author={McClanahan, Chris},
  note={Survey paper},
  year={2010}
}

@Comment{Computer Graphics Forum is a journal (volume 26, number 1), hence an article entry.}
@article{owens2007survey,
  title={A Survey of General-Purpose Computation on Graphics Hardware},
  author={Owens, John D. and Luebke, David and Govindaraju, Naga and Harris, Mark and Kr{\"u}ger, Jens and Lefohn, Aaron E. and Purcell, Timothy J.},
  journal={Computer Graphics Forum},
  volume={26},
  number={1},
  pages={80--113},
  year={2007},
  publisher={Wiley Online Library}
}

@Comment{Journal-issue record (SIGPLAN Not.); same authors, pages and acmid as the proceedings entry Yang:2010:GCM:1806596.1806606.}
@article{Yang:2010:GCM:1809028.1806606,
 author = {Yang, Yi and Xiang, Ping and Kong, Jingfei and Zhou, Huiyang},
 title = {A {GPGPU} Compiler for Memory Optimization and Parallelism Management},
 journal = {SIGPLAN Not.},
 issue_date = {June 2010},
 volume = {45},
 number = {6},
 month = jun,
 year = {2010},
 issn = {0362-1340},
 pages = {86--97},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/1809028.1806606},
 doi = {10.1145/1809028.1806606},
 acmid = {1806606},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {compiler, gpgpu},
}

@Comment{Proceedings record (PLDI '10); same authors, pages and acmid as the journal-issue entry Yang:2010:GCM:1809028.1806606.}
@inproceedings{Yang:2010:GCM:1806596.1806606,
 author = {Yang, Yi and Xiang, Ping and Kong, Jingfei and Zhou, Huiyang},
 title = {A {GPGPU} Compiler for Memory Optimization and Parallelism Management},
 booktitle = {Proceedings of the 31st ACM SIGPLAN Conference on Programming Language Design and Implementation},
 series = {PLDI '10},
 year = {2010},
 isbn = {978-1-4503-0019-3},
 location = {Toronto, Ontario, Canada},
 pages = {86--97},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/1806596.1806606},
 doi = {10.1145/1806596.1806606},
 acmid = {1806606},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {compiler, gpgpu},
}

@Comment{Proceedings record (PLDI '09); same authors, pages and acmid as the journal-issue entry Saha:2009:PMH:1543135.1542525.}
@inproceedings{Saha:2009:PMH:1542476.1542525,
  author    = {Saha, Bratin and Zhou, Xiaocheng and Chen, Hu and Gao, Ying and Yan, Shoumeng and Rajagopalan, Mohan and Fang, Jesse and Zhang, Peinan and Ronen, Ronny and Mendelson, Avi},
  title     = {Programming Model for a Heterogeneous x86 Platform},
  booktitle = {Proceedings of the 30th ACM SIGPLAN Conference on Programming Language Design and Implementation},
  series    = {PLDI '09},
  location  = {Dublin, Ireland},
  year      = {2009},
  pages     = {431--440},
  numpages  = {10},
  isbn      = {978-1-60558-392-1},
  publisher = {ACM},
  address   = {New York, NY, USA},
  doi       = {10.1145/1542476.1542525},
  url       = {http://doi.acm.org/10.1145/1542476.1542525},
  acmid     = {1542525},
  keywords  = {heterogeneous platforms, programming model},
}

@Comment{Journal-issue record (SIGPLAN Not.); same authors, pages and acmid as the proceedings entry Saha:2009:PMH:1542476.1542525.}
@article{Saha:2009:PMH:1543135.1542525,
  author     = {Saha, Bratin and Zhou, Xiaocheng and Chen, Hu and Gao, Ying and Yan, Shoumeng and Rajagopalan, Mohan and Fang, Jesse and Zhang, Peinan and Ronen, Ronny and Mendelson, Avi},
  title      = {Programming Model for a Heterogeneous x86 Platform},
  journal    = {SIGPLAN Not.},
  volume     = {44},
  number     = {6},
  month      = jun,
  year       = {2009},
  issue_date = {June 2009},
  pages      = {431--440},
  numpages   = {10},
  issn       = {0362-1340},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  doi        = {10.1145/1543135.1542525},
  url        = {http://doi.acm.org/10.1145/1543135.1542525},
  acmid      = {1542525},
  keywords   = {heterogeneous platforms, programming model},
}

@article{lindholm2008nvidia,
  title={{NVIDIA} {Tesla}: A Unified Graphics and Computing Architecture},
  author={Lindholm, Erik and Nickolls, John and Oberman, Stuart and Montrym, John},
  journal={IEEE Micro},
  volume={28},
  number={2},
  pages={39--55},
  year={2008},
  publisher={IEEE}
}

@inproceedings{Peercy:2006:PDP:1179849.1180079,
 author = {Peercy, Mark and Segal, Mark and Gerstmann, Derek},
 title = {A Performance-oriented Data Parallel Virtual Machine for {GPUs}},
 booktitle = {ACM SIGGRAPH 2006 Sketches},
 series = {SIGGRAPH '06},
 year = {2006},
 isbn = {1-59593-364-6},
 location = {Boston, Massachusetts},
 articleno = {184},
 url = {http://doi.acm.org/10.1145/1179849.1180079},
 doi = {10.1145/1179849.1180079},
 acmid = {1180079},
 publisher = {ACM},
 address = {New York, NY, USA},
}

@Comment{The percent sign in the abstract is escaped so that a style printing the abstract does not start a TeX comment.}
@techreport{Catanzaro:EECS-2010-124,
    Author = {Catanzaro, Bryan and Garland, Michael and Keutzer, Kurt},
    Title = {Copperhead: Compiling an Embedded Data Parallel Language},
    Institution = {EECS Department, University of California, Berkeley},
    Year = {2010},
    Month = sep,
    URL = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2010/EECS-2010-124.html},
    Number = {UCB/EECS-2010-124},
    Abstract = {Modern parallel microprocessors deliver high performance on applications that expose substantial fine-grained data parallelism. Although data parallelism is widely available in many computations, implementing data parallel algorithms in low-level languages is often an unnecessarily difficult task. The characteristics of parallel microprocessors and the limitations of current programming methodologies motivate our design of Copperhead, a high-level data parallel language embedded in Python. The Copperhead programmer describes parallel computations via composition of familiar data parallel primitives supporting both flat and nested data parallel computation on arrays of data. Copperhead programs are expressed in a subset of the widely used Python programming language and interoperate with standard Python modules, including libraries for numeric computation, data visualization, and analysis.
In this paper, we discuss the language, compiler, and runtime features that enable Copperhead to efficiently execute data parallel code. We define the restricted subset of Python which Copperhead supports and introduce the program analysis techniques necessary for compiling Copperhead code into efficient low-level implementations. We also outline the runtime support by which Copperhead programs interoperate with standard Python modules. We demonstrate the effectiveness of our techniques with several examples targeting the CUDA platform for parallel programming on GPUs. Copperhead code is concise, on average requiring 3.6 times fewer lines of code than CUDA, and the compiler generates efficient code, yielding 45--100\% of the performance of hand-crafted, well optimized CUDA code.}
}

@inproceedings{Chakravarty:2011:AHA:1926354.1926358,
 author = {Chakravarty, Manuel M. T. and Keller, Gabriele and Lee, Sean and McDonell, Trevor L. and Grover, Vinod},
 title = {Accelerating {Haskell} Array Codes with Multicore {GPUs}},
 booktitle = {Proceedings of the Sixth Workshop on Declarative Aspects of Multicore Programming},
 series = {DAMP '11},
 year = {2011},
 isbn = {978-1-4503-0486-3},
 location = {Austin, Texas, USA},
 pages = {3--14},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/1926354.1926358},
 doi = {10.1145/1926354.1926358},
 acmid = {1926358},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {arrays, data parallelism, dynamic compilation, gpgpu, haskell, skeletons},
}

@Comment{Proceedings record (ICFP '13); same authors, pages and acmid as the journal-issue entry McDonell:2013:OPF:2544174.2500595.}
@inproceedings{McDonell:2013:OPF:2500365.2500595,
 author = {McDonell, Trevor L. and Chakravarty, Manuel M. T. and Keller, Gabriele and Lippmeier, Ben},
 title = {Optimising Purely Functional {GPU} Programs},
 booktitle = {Proceedings of the 18th ACM SIGPLAN International Conference on Functional Programming},
 series = {ICFP '13},
 year = {2013},
 isbn = {978-1-4503-2326-0},
 location = {Boston, Massachusetts, USA},
 pages = {49--60},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/2500365.2500595},
 doi = {10.1145/2500365.2500595},
 acmid = {2500595},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {array fusion, arrays, data parallelism, dynamic compilation, embedded language, gpgpu, haskell, sharing recovery},
}

@Comment{Journal-issue record (SIGPLAN Not.); same authors, pages and acmid as the proceedings entry McDonell:2013:OPF:2500365.2500595.}
@article{McDonell:2013:OPF:2544174.2500595,
 author = {McDonell, Trevor L. and Chakravarty, Manuel M. T. and Keller, Gabriele and Lippmeier, Ben},
 title = {Optimising Purely Functional {GPU} Programs},
 journal = {SIGPLAN Not.},
 issue_date = {September 2013},
 volume = {48},
 number = {9},
 month = sep,
 year = {2013},
 issn = {0362-1340},
 pages = {49--60},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/2544174.2500595},
 doi = {10.1145/2544174.2500595},
 acmid = {2500595},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {array fusion, arrays, data parallelism, dynamic compilation, embedded language, gpgpu, haskell, sharing recovery},
}

@inproceedings{Elliott:2004:PGP:1017472.1017482,
  author    = {Elliott, Conal},
  title     = {Programming Graphics Processors Functionally},
  booktitle = {Proceedings of the 2004 ACM SIGPLAN Workshop on Haskell},
  series    = {Haskell '04},
  location  = {Snowbird, Utah, USA},
  year      = {2004},
  pages     = {45--56},
  numpages  = {12},
  isbn      = {1-58113-850-4},
  publisher = {ACM},
  address   = {New York, NY, USA},
  doi       = {10.1145/1017472.1017482},
  url       = {http://doi.acm.org/10.1145/1017472.1017482},
  acmid     = {1017482},
  keywords  = {3D modeling, code generation, compilers, computer algebra, computer graphics, domain-specific languages, functional geometry, functional programming, graphics languages, graphics processors, partial evaluation, procedural geometry, procedural shading, shading languages},
}

@Comment{Journal-issue record (SIGPLAN Not.); same authors, pages and acmid as the proceedings entry Mainland:2010:NEC:1863523.1863533. Month follows issue_date (November 2010, number 11).}
@article{Mainland:2010:NEC:2088456.1863533,
 author = {Mainland, Geoffrey and Morrisett, Greg},
 title = {Nikola: Embedding Compiled {GPU} Functions in {Haskell}},
 journal = {SIGPLAN Not.},
 issue_date = {November 2010},
 volume = {45},
 number = {11},
 month = nov,
 year = {2010},
 issn = {0362-1340},
 pages = {67--78},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/2088456.1863533},
 doi = {10.1145/2088456.1863533},
 acmid = {1863533},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {cuda, gpu, meta programming},
}

@Comment{Proceedings record (Haskell '10); same authors, pages and acmid as the journal-issue entry Mainland:2010:NEC:2088456.1863533.}
@inproceedings{Mainland:2010:NEC:1863523.1863533,
 author = {Mainland, Geoffrey and Morrisett, Greg},
 title = {Nikola: Embedding Compiled {GPU} Functions in {Haskell}},
 booktitle = {Proceedings of the Third ACM Haskell Symposium on Haskell},
 series = {Haskell '10},
 year = {2010},
 isbn = {978-1-4503-0252-4},
 location = {Baltimore, Maryland, USA},
 pages = {67--78},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/1863523.1863533},
 doi = {10.1145/1863523.1863533},
 acmid = {1863533},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {cuda, gpu, meta programming},
}

@Comment{Journal-issue record (SIGPLAN Not.); same authors, pages and acmid as the proceedings entry Bergstrom:2012:NDG:2364527.2364563.}
@article{Bergstrom:2012:NDG:2398856.2364563,
 author = {Bergstrom, Lars and Reppy, John},
 title = {Nested Data-parallelism on the {GPU}},
 journal = {SIGPLAN Not.},
 issue_date = {September 2012},
 volume = {47},
 number = {9},
 month = sep,
 year = {2012},
 issn = {0362-1340},
 pages = {247--258},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/2398856.2364563},
 doi = {10.1145/2398856.2364563},
 acmid = {2364563},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {gpgpu, gpu, nesl, nested data parallelism},
}

@Comment{Proceedings record (ICFP '12); same authors, pages and acmid as the journal-issue entry Bergstrom:2012:NDG:2398856.2364563.}
@inproceedings{Bergstrom:2012:NDG:2364527.2364563,
 author = {Bergstrom, Lars and Reppy, John},
 title = {Nested Data-parallelism on the {GPU}},
 booktitle = {Proceedings of the 17th ACM SIGPLAN International Conference on Functional Programming},
 series = {ICFP '12},
 year = {2012},
 isbn = {978-1-4503-1054-3},
 location = {Copenhagen, Denmark},
 pages = {247--258},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/2364527.2364563},
 doi = {10.1145/2364527.2364563},
 acmid = {2364563},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {gpgpu, gpu, nesl, nested data parallelism},
}

@inproceedings{Claessen:2012:EAC:2103736.2103740,
 author = {Claessen, Koen and Sheeran, Mary and Svensson, Bo Joel},
 title = {Expressive Array Constructs in an Embedded {GPU} Kernel Programming Language},
 booktitle = {Proceedings of the 7th Workshop on Declarative Aspects and Applications of Multicore Programming},
 series = {DAMP '12},
 year = {2012},
 isbn = {978-1-4503-1117-5},
 location = {Philadelphia, Pennsylvania, USA},
 pages = {21--30},
 numpages = {10},
 url = {http://doi.acm.org/10.1145/2103736.2103740},
 doi = {10.1145/2103736.2103740},
 acmid = {2103740},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {arrays, data parallelism, embedded domain specific language, general purpose gpu programming, haskell},
}

@inproceedings{Gaster:2013:FAS:2458523.2458527,
 author = {Gaster, Benedict R. and Howes, Lee},
 title = {Formalizing Address Spaces with Application to {CUDA}, {OpenCL}, and Beyond},
 booktitle = {Proceedings of the 6th Workshop on General Purpose Processor Using Graphics Processing Units},
 series = {GPGPU-6},
 year = {2013},
 isbn = {978-1-4503-2017-7},
 location = {Houston, Texas, USA},
 pages = {32--41},
 numpages = {10},
 url = {http://doi.acm.org/10.1145/2458523.2458527},
 doi = {10.1145/2458523.2458527},
 acmid = {2458527},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {C++, GPGPU, GPU, OpenCL},
}

@inproceedings{Hower:2014:HMM:2541940.2541981,
  author    = {Hower, Derek R. and Hechtman, Blake A. and Beckmann, Bradford M. and Gaster, Benedict R. and Hill, Mark D. and Reinhardt, Steven K. and Wood, David A.},
  title     = {Heterogeneous-race-free Memory Models},
  booktitle = {Proceedings of the 19th International Conference on Architectural Support for Programming Languages and Operating Systems},
  series    = {ASPLOS '14},
  location  = {Salt Lake City, Utah, USA},
  year      = {2014},
  pages     = {427--440},
  numpages  = {14},
  isbn      = {978-1-4503-2305-5},
  publisher = {ACM},
  address   = {New York, NY, USA},
  doi       = {10.1145/2541940.2541981},
  url       = {http://doi.acm.org/10.1145/2541940.2541981},
  acmid     = {2541981},
  keywords  = {data-race-free, heterogeneous systems, memory consistency model, task runtime},
}

@inproceedings{Taft:2011:MMC:2048147.2048192,
  author    = {Taft, S. Tucker and Bloch, Joshua and Bocchino, Robert and Burckhardt, Sebastian and Chafi, Hassan and Cox, Russ and Gaster, Benedict and Steele, Guy and Ungar, David},
  title     = {Multicore, Manycore, and Cloud Computing: Is a New Programming Language Paradigm Required?},
  booktitle = {Proceedings of the ACM International Conference Companion on Object Oriented Programming Systems Languages and Applications Companion},
  series    = {OOPSLA '11},
  location  = {Portland, Oregon, USA},
  year      = {2011},
  pages     = {165--170},
  numpages  = {6},
  isbn      = {978-1-4503-0942-4},
  publisher = {ACM},
  address   = {New York, NY, USA},
  doi       = {10.1145/2048147.2048192},
  url       = {http://doi.acm.org/10.1145/2048147.2048192},
  acmid     = {2048192},
  keywords  = {cloud computing, manycore programming, multicore programming, new programming paradigms},
}

@inproceedings{Alglave:2015:GCW:2694344.2694391,
 author = {Alglave, Jade and Batty, Mark and Donaldson, Alastair F. and Gopalakrishnan, Ganesh and Ketema, Jeroen and Poetzl, Daniel and Sorensen, Tyler and Wickerson, John},
 title = {{GPU} Concurrency: Weak Behaviours and Programming Assumptions},
 booktitle = {Proceedings of the Twentieth International Conference on Architectural Support for Programming Languages and Operating Systems},
 series = {ASPLOS '15},
 year = {2015},
 isbn = {978-1-4503-2835-7},
 location = {Istanbul, Turkey},
 pages = {577--591},
 numpages = {15},
 url = {http://doi.acm.org/10.1145/2694344.2694391},
 doi = {10.1145/2694344.2694391},
 acmid = {2694391},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {GPU, Nvidia PTX, formal model, litmus testing, memory consistency, openCL, test generation},
}

@inproceedings{Sorensen:2013:TSM:2464996.2467280,
 author = {Sorensen, Tyler and Gopalakrishnan, Ganesh and Grover, Vinod},
 title = {Towards Shared Memory Consistency Models for {GPUs}},
 booktitle = {Proceedings of the 27th International ACM Conference on International Conference on Supercomputing},
 series = {ICS '13},
 year = {2013},
 isbn = {978-1-4503-2130-3},
 location = {Eugene, Oregon, USA},
 pages = {489--490},
 numpages = {2},
 url = {http://doi.acm.org/10.1145/2464996.2467280},
 doi = {10.1145/2464996.2467280},
 acmid = {2467280},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {GPU, memory fences, shared memory consistency},
}

@inproceedings{Gummaraju:2010:TPS:1854273.1854302,
 author = {Gummaraju, Jayanth and Morichetti, Laurent and Houston, Michael and Sander, Ben and Gaster, Benedict R. and Zheng, Bixia},
 title = {{Twin Peaks}: A Software Platform for Heterogeneous Computing on General-purpose and Graphics Processors},
 booktitle = {Proceedings of the 19th International Conference on Parallel Architectures and Compilation Techniques},
 series = {PACT '10},
 year = {2010},
 isbn = {978-1-4503-0178-7},
 location = {Vienna, Austria},
 pages = {205--216},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/1854273.1854302},
 doi = {10.1145/1854273.1854302},
 acmid = {1854302},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {GPGPU, OpenCL, multicore, programmability, runtime},
}

@Comment{TODO: acquire paper}
@inproceedings{Svensson:2008:ODS:2044476.2044485,
  author    = {Svensson, Joel and Sheeran, Mary and Claessen, Koen},
  title     = {Obsidian: A Domain Specific Embedded Language for Parallel Programming of Graphics Processors},
  booktitle = {Proceedings of the 20th International Conference on Implementation and Application of Functional Languages},
  series    = {IFL'08},
  location  = {Hatfield, UK},
  year      = {2011},
  pages     = {156--173},
  numpages  = {18},
  isbn      = {978-3-642-24451-3},
  publisher = {Springer-Verlag},
  address   = {Berlin, Heidelberg},
  url       = {http://dl.acm.org/citation.cfm?id=2044476.2044485},
  acmid     = {2044485},
}

@inproceedings{Keller:2010:RSP:1863543.1863582,
 author = {Keller, Gabriele and Chakravarty, Manuel M. T. and Leshchinskiy, Roman and Peyton Jones, Simon and Lippmeier, Ben},
 title = {Regular, Shape-polymorphic, Parallel Arrays in {Haskell}},
 booktitle = {Proceedings of the 15th ACM SIGPLAN International Conference on Functional Programming},
 series = {ICFP '10},
 year = {2010},
 isbn = {978-1-60558-794-3},
 location = {Baltimore, Maryland, USA},
 pages = {261--272},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/1863543.1863582},
 doi = {10.1145/1863543.1863582},
 acmid = {1863582},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {arrays, data parallelism, haskell},
}

@inproceedings{Foltzer:2012:MPC:2364527.2364562,
  author    = {Foltzer, Adam and Kulkarni, Abhishek and Swords, Rebecca and Sasidharan, Sajith and Jiang, Eric and Newton, Ryan},
  title     = {A Meta-scheduler for the Par-monad: Composable Scheduling for the Heterogeneous Cloud},
  booktitle = {Proceedings of the 17th ACM SIGPLAN International Conference on Functional Programming},
  series    = {ICFP '12},
  location  = {Copenhagen, Denmark},
  year      = {2012},
  pages     = {235--246},
  numpages  = {12},
  isbn      = {978-1-4503-1054-3},
  publisher = {ACM},
  address   = {New York, NY, USA},
  doi       = {10.1145/2364527.2364562},
  url       = {http://doi.acm.org/10.1145/2364527.2364562},
  acmid     = {2364562},
  keywords  = {composability, gpu, haskell, work-stealing},
}

@Comment{The venue is a developer summit, not a journal, so it is recorded in howpublished on a misc entry.}
@misc{kyriazis2012heterogeneous,
  title={Heterogeneous System Architecture: A Technical Review},
  author={Kyriazis, George},
  howpublished={AMD Fusion Developer Summit},
  year={2012}
}

@inproceedings{Wang:2007:EAP:1250734.1250753,
 author = {Wang, Perry H. and Collins, Jamison D. and Chinya, Gautham N. and Jiang, Hong and Tian, Xinmin and Girkar, Milind and Yang, Nick Y. and Lueh, Guei-Yuan and Wang, Hong},
 title = {{EXOCHI}: Architecture and Programming Environment for a Heterogeneous Multi-core Multithreaded System},
 booktitle = {Proceedings of the 28th ACM SIGPLAN Conference on Programming Language Design and Implementation},
 series = {PLDI '07},
 year = {2007},
 isbn = {978-1-59593-633-2},
 location = {San Diego, California, USA},
 pages = {156--166},
 numpages = {11},
 url = {http://doi.acm.org/10.1145/1250734.1250753},
 doi = {10.1145/1250734.1250753},
 acmid = {1250753},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {GPU, heterogeneous multi-cores, openMP},
}

@inproceedings{Becchi:2010:DSL:1810479.1810498,
  author    = {Becchi, Michela and Byna, Surendra and Cadambi, Srihari and Chakradhar, Srimat},
  title     = {Data-aware Scheduling of Legacy Kernels on Heterogeneous Platforms with Distributed Memory},
  booktitle = {Proceedings of the Twenty-second Annual ACM Symposium on Parallelism in Algorithms and Architectures},
  series    = {SPAA '10},
  location  = {Thira, Santorini, Greece},
  year      = {2010},
  pages     = {82--91},
  numpages  = {10},
  isbn      = {978-1-4503-0079-7},
  publisher = {ACM},
  address   = {New York, NY, USA},
  doi       = {10.1145/1810479.1810498},
  url       = {http://doi.acm.org/10.1145/1810479.1810498},
  acmid     = {1810498},
  keywords  = {accelerators, distributed memory, heterogeneous platforms, multi-core processors, runtime},
}

@Comment{A vendor white paper, not a journal article: recorded as a techreport with type White Paper.}
@techreport{paranjape2012heterogeneous,
 title={Heterogeneous Computing in the Cloud: Crunching Big Data and Democratizing {HPC} Access for the Life Sciences},
 author={Paranjape, Ketan and Hebert, Steve and Masson, Bob},
 institution={Intel},
 type={White Paper},
 year={2012}
}

@Comment{MSPC is a workshop venue rather than a journal, so the venue is given as booktitle on an inproceedings entry.}
@inproceedings{hower2013sequential,
 title     = {Sequential Consistency for Heterogeneous-Race-Free},
 author    = {Hower, Derek R. and Beckmann, Bradford M. and Gaster, Benedict R. and Hechtman, Blake A. and Hill, Mark D. and Reinhardt, Steven K. and Wood, David A.},
 booktitle = {Memory Systems Performance and Correctness (MSPC)},
 year      = {2013}
}

@misc{claessen2009obsidian,
 title  = {Obsidian: {GPU} Programming in {Haskell}},
 author = {Claessen, Koen and Sheeran, Mary and Svensson, Joel},
 year   = {2009}
}

@inproceedings{Leung:2010:MPM:1735688.1735698,
 author = {Leung, Allen and Vasilache, Nicolas and Meister, Beno{\^\i}t and Baskaran, Muthu and Wohlford, David and Bastoul, C{\'e}dric and Lethin, Richard},
 title = {A Mapping Path for multi-{GPGPU} Accelerated Computers from a Portable High Level Programming Abstraction},
 booktitle = {Proceedings of the 3rd Workshop on General-Purpose Computation on Graphics Processing Units},
 series = {GPGPU '10},
 year = {2010},
 isbn = {978-1-60558-935-0},
 location = {Pittsburgh, Pennsylvania, USA},
 pages = {51--61},
 numpages = {11},
 url = {http://doi.acm.org/10.1145/1735688.1735698},
 doi = {10.1145/1735688.1735698},
 acmid = {1735698},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {CUDA, GPGPU, automatic translation, compiler optimization, parallelization, polyhedral model},
}

@inproceedings{Liu:2014:NOH:2636228.2636236,
 author = {Liu, Hai and Day, Laurence E. and Glew, Neal and Anderson, Todd A. and Barik, Rajkishore},
 title = {Native Offload of {Haskell} {Repa} Programs to Integrated {GPUs}},
 booktitle = {Proceedings of the 3rd ACM SIGPLAN Workshop on Functional High-performance Computing},
 series = {FHPC '14},
 year = {2014},
 isbn = {978-1-4503-3040-4},
 location = {Gothenburg, Sweden},
 pages = {87--97},
 numpages = {11},
 url = {http://doi.acm.org/10.1145/2636228.2636236},
 doi = {10.1145/2636228.2636236},
 acmid = {2636236},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {gpu programming, haskell, heterogeneous programming},
}

@Comment{The citation key carries the year as 0014 although the year field is 2014; the key is left unchanged so existing citations keep resolving.}
@inproceedings{Clifton-Everest:0014:EFC:2695475.2695485,
 author = {Clifton-Everest, Robert and McDonell, Trevor L. and Chakravarty, Manuel M. T. and Keller, Gabriele},
 title = {Embedding Foreign Code},
 booktitle = {Proceedings of the 16th International Symposium on Practical Aspects of Declarative Languages - Volume 8324},
 series = {PADL 2014},
 year = {2014},
 isbn = {978-3-319-04131-5},
 location = {San Diego, CA, USA},
 pages = {136--151},
 numpages = {16},
 url = {http://dx.doi.org/10.1007/978-3-319-04132-2_10},
 doi = {10.1007/978-3-319-04132-2_10},
 acmid = {2695485},
 publisher = {Springer-Verlag New York, Inc.},
 address = {New York, NY, USA},
}

@inproceedings{Fatahalian:2004:UEG:1058129.1058148,
 author = {Fatahalian, Kayvon and Sugerman, Jeremy and Hanrahan, Pat},
 title = {Understanding the Efficiency of {GPU} Algorithms for Matrix-matrix Multiplication},
 booktitle = {Proceedings of the ACM SIGGRAPH/EUROGRAPHICS Conference on Graphics Hardware},
 series = {HWWS '04},
 year = {2004},
 isbn = {3-905673-15-0},
 location = {Grenoble, France},
 pages = {133--137},
 numpages = {5},
 url = {http://doi.acm.org/10.1145/1058129.1058148},
 doi = {10.1145/1058129.1058148},
 acmid = {1058148},
 publisher = {ACM},
 address = {New York, NY, USA},
}

@article{Gaster:2012:GPL:2780203.2780214,
 author = {Gaster, Benedict R. and Howes, Lee},
 title = {Can {GPGPU} Programming Be Liberated from the Data-Parallel Bottleneck?},
 journal = {Computer},
 issue_date = {August 2012},
 volume = {45},
 number = {8},
 month = aug,
 year = {2012},
 issn = {0018-9162},
 pages = {42--52},
 numpages = {11},
 url = {http://dx.doi.org/10.1109/MC.2012.257},
 doi = {10.1109/MC.2012.257},
 acmid = {2780214},
 publisher = {IEEE Computer Society Press},
 address = {Los Alamitos, CA, USA},
}

@inproceedings{Silberstein:2013:GIF:2451116.2451169,
 author = {Silberstein, Mark and Ford, Bryan and Keidar, Idit and Witchel, Emmett},
 title = {{GPUfs}: Integrating a File System with {GPUs}},
 booktitle = {Proceedings of the Eighteenth International Conference on Architectural Support for Programming Languages and Operating Systems},
 series = {ASPLOS '13},
 year = {2013},
 isbn = {978-1-4503-1870-9},
 location = {Houston, Texas, USA},
 pages = {485--498},
 numpages = {14},
 url = {http://doi.acm.org/10.1145/2451116.2451169},
 doi = {10.1145/2451116.2451169},
 acmid = {2451169},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {accelerators, file systems, gpgpus, operating systems design},
}

@Comment{Month follows issue_date (December 2014, number 12).}
@article{Silberstein:2014:GCO:2692965.2656206,
 author = {Silberstein, Mark and Ford, Bryan and Witchel, Emmett},
 title = {{GPUfs}: The Case for Operating System Services on {GPUs}},
 journal = {Commun. ACM},
 issue_date = {December 2014},
 volume = {57},
 number = {12},
 month = dec,
 year = {2014},
 issn = {0001-0782},
 pages = {68--79},
 numpages = {12},
 url = {http://doi.acm.org/10.1145/2656206},
 doi = {10.1145/2656206},
 acmid = {2656206},
 publisher = {ACM},
 address = {New York, NY, USA},
}

@inproceedings{Benabderrahmane:2010:PMM:2175462.2175484,
 author = {Benabderrahmane, Mohamed-Walid and Pouchet, Louis-No{\"e}l and Cohen, Albert and Bastoul, C{\'e}dric},
 title = {The Polyhedral Model is More Widely Applicable Than You Think},
 booktitle = {Proceedings of the 19th Joint European Conference on Theory and Practice of Software, International Conference on Compiler Construction},
 series = {CC'10/ETAPS'10},
 year = {2010},
 isbn = {978-3-642-11969-9},
 location = {Paphos, Cyprus},
 pages = {283--303},
 numpages = {21},
 url = {http://dx.doi.org/10.1007/978-3-642-11970-5_16},
 doi = {10.1007/978-3-642-11970-5_16},
 acmid = {2175484},
 publisher = {Springer-Verlag},
 address = {Berlin, Heidelberg},
}

@article{Gaster:2015:HAH:2744295.2701618,
 author = {Gaster, Benedict R. and Hower, Derek and Howes, Lee},
 title = {{HRF}-Relaxed: Adapting {HRF} to the Complexities of Industrial Heterogeneous Memory Models},
 journal = {ACM Trans. Archit. Code Optim.},
 issue_date = {April 2015},
 volume = {12},
 number = {1},
 month = apr,
 year = {2015},
 issn = {1544-3566},
 pages = {7:1--7:26},
 articleno = {7},
 numpages = {26},
 url = {http://doi.acm.org/10.1145/2701618},
 doi = {10.1145/2701618},
 acmid = {2701618},
 publisher = {ACM},
 address = {New York, NY, USA},
 keywords = {Memory models, computer architecture, formal models, programming languages},
}

@inproceedings{Haidar:2015:WDS:2832080.2832085,
  author    = {Haidar, Azzam and Jia, Yulu and Luszczek, Piotr and Tomov, Stanimire and YarKhan, Asim and Dongarra, Jack},
  title     = {Weighted Dynamic Scheduling with Many Parallelism Grains for Offloading of Numerical Workloads to Multiple Varied Accelerators},
  booktitle = {Proceedings of the 6th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems},
  series    = {ScalA '15},
  location  = {Austin, Texas},
  year      = {2015},
  pages     = {5:1--5:8},
  articleno = {5},
  numpages  = {8},
  isbn      = {978-1-4503-4011-3},
  publisher = {ACM},
  address   = {New York, NY, USA},
  doi       = {10.1145/2832080.2832085},
  url       = {http://doi.acm.org/10.1145/2832080.2832085},
  acmid     = {2832085},
  keywords  = {dataflow scheduling, hardware accelerators, multi-grain parallelism},
}