#!/usr/bin/perl use strict; use warnings; use ExtUtils::nvcc; use Inline C => DATA => ExtUtils::nvcc::Inline; # Generate a series of 100 sequential values and pack them # as an array of floats: my $data = pack('f*', 1..100); # Call the Perl-callable wrapper to the CUDA kernel: cuda_test($data); # Print the results print "Got ", join (', ', unpack('f*', $data)), "\n"; END { # I was having trouble with memory leaks. This messgae # indicates that the segmentation fault occurrs after # the end of the script's execution. (However, it no # longer appears to be a problem! :-) print "Really done!\n"; } __END__ __C__ // This is a very simple CUDA kernel that triples the value of the // global data associated with the location at threadIdx.x. NOTE: this // is a particularly good example of BAD programming - it should be // more defensive. It is just a proof of concept, to show that you can // indeed write CUDA kernels using Inline::C. __global__ void triple(float * data_g) { data_g[threadIdx.x] *= 3; } // NOTE: Do not make such a kernel a regular habit. Generally, copying // data to and from the device is very, very slow (compared with all // other CUDA operations). This is just a proof of concept. void cuda_test(char * input) { // Inline::C knows how to massage a Perl scalar into a char // array (pointer), which I can easily cast as a float pointer: float * data = (float * ) input; // Allocate the memory of the device: float * data_d; unsigned int data_bytes = sizeof(float) * 100; cudaMalloc(&data_d, data_bytes); // Copy the host memory to the device: cudaMemcpy(data_d, data, data_bytes, cudaMemcpyHostToDevice); // Print a status indicator and execuate the kernel printf("Trippling values via CUDA\n"); // Execute the kernel: triple <<<1, 100>>>(data_d); // Copy the contents back to the Perl scalar: cudaMemcpy(data, data_d, data_bytes, cudaMemcpyDeviceToHost); // Free the device memory cudaFree(data_d); }