How to avoid un-coalesced accesses in matrix multiplication CUDA kernel?
I am learning CUDA with the book 'Programming Massively Parallel Processors'. A practice problem from chapter 5 confuses me:
For tiled matrix multiplication, out of the possible range of values for BLOCK_SIZE, for what values of BLOCK_SIZE will the kernel completely avoid un-coalesced accesses to global memory? (You only need to consider square blocks.)
In my understanding, BLOCK_SIZE has little to do with memory coalescing. As long as threads within a single warp access consecutive elements, we will have coalesced accesses. I could not figure out where the kernel has un-coalesced accesses to global memory. Any hints?
Here is the kernel's source code:
#define COMMON_WIDTH 512
#define ROW_LEFT 500
#define COL_RIGHT 250
#define K 1000
#define TILE_WIDTH 32
__device__ int D_ROW_LEFT = ROW_LEFT;
__device__ int D_COL_RIGHT = COL_RIGHT;
__device__ int D_K = K;
.....
__global__
void MatrixMatrixMultTiled(float *matrixLeft, float *matrixRight, float *output){
    __shared__ float sMatrixLeft[TILE_WIDTH][TILE_WIDTH];
    __shared__ float sMatrixRight[TILE_WIDTH][TILE_WIDTH];
    int bx = blockIdx.x; int by = blockIdx.y;
    int tx = threadIdx.x; int ty = threadIdx.y;
    int col = bx * TILE_WIDTH + tx;
    int row = by * TILE_WIDTH + ty;
    float value = 0;
    for (int i = 0; i < ceil(D_K/(float)TILE_WIDTH); ++i){
        if (row < D_ROW_LEFT && row * D_K + i * TILE_WIDTH + tx < D_K){
            sMatrixLeft[ty][tx] = matrixLeft[row * D_K + i * TILE_WIDTH + tx];
        }
        if (col < D_COL_RIGHT && (ty + i * TILE_WIDTH) * D_COL_RIGHT + col < D_K){
            sMatrixRight[ty][tx] = matrixRight[(ty + i * TILE_WIDTH) * D_COL_RIGHT + col];
        }
        __syncthreads();
        for (int j = 0; j < TILE_WIDTH; j++){
            value += sMatrixLeft[ty][j] * sMatrixRight[j][tx];
        }
        __syncthreads();
    }
    if (row < D_ROW_LEFT && col < D_COL_RIGHT){
        output[row * D_COL_RIGHT + col] = value;
    }
}
parallel-processing cuda
asked Jan 2 at 9:06 by pounch_doggie, edited Jan 2 at 16:39 by talonmies
I would guess they mean TILE_WIDTH, not block size. That is a very old book and I wouldn't recommend it as a first guide; hardware and software have evolved considerably since it was written almost 10 years ago. – talonmies, Jan 2 at 9:59
@talonmies Thank you for your response. I think Robert Crovella's answer cleared things up for me. Right now I have the 3rd edition of 'Programming Massively Parallel Processors', published in 2016. Is it too old to use? – pounch_doggie, Jan 3 at 4:08
1 Answer
Your question is incomplete, since the code you have posted does not make any reference to BLOCK_SIZE, and that is certainly relevant to the question posed in the book. More generally, questions that present a kernel without its launch configuration are often incomplete, since the launch configuration is frequently relevant to both the correctness and the behavior of a kernel.
I've not re-read this portion of the book just now, but I'll assume the kernel launch configuration uses a block dimension something like the following (this information is absent from your question, and in my opinion it should have been included for the question to make sense):
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(...,...);
And I will assume the kernel launch is given by something like:
MatrixMatrixMultTiled<<<dimGrid, dimBlock>>>(...);
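For concreteness, one plausible grid sizing would cover the ROW_LEFT x COL_RIGHT output in BLOCK_SIZE x BLOCK_SIZE tiles. The sketch below is only an assumption built from the question's #defines; the device pointer names d_left, d_right and d_out are hypothetical:
// Hypothetical host-side launch (not shown in the question): one thread per
// output element, so the grid covers the COL_RIGHT columns and ROW_LEFT rows.
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid((COL_RIGHT + BLOCK_SIZE - 1) / BLOCK_SIZE,   // x spans output columns
             (ROW_LEFT  + BLOCK_SIZE - 1) / BLOCK_SIZE);  // y spans output rows
MatrixMatrixMultTiled<<<dimGrid, dimBlock>>>(d_left, d_right, d_out);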
Your statement, "As long as threads within a single warp access consecutive elements, we will have coalesced accesses", is a reasonable working definition. Let's show that it is violated for some choices of BLOCK_SIZE, given the above assumptions to cover the gaps in your incomplete question.
Coalesced access is a term that applies to global memory accesses only. We will therefore ignore accesses to shared memory. We will also, for this discussion, ignore accesses to the __device__ variables such as D_ROW_LEFT. (The access to those variables appears to be uniform. We can quibble about whether that constitutes coalesced access; my claim would be that it does, but we need not unpack that here.) Therefore we are left with just 3 "access" points:
matrixLeft[row * D_K + i * TILE_WIDTH +tx];
matrixRight[(ty + i * TILE_WIDTH) * D_COL_RIGHT + col];
output[row * D_COL_RIGHT + col]
Now, to pick an example, let's suppose BLOCK_SIZE is 16. Will any of the above access points violate your statement "threads within a single warp access consecutive elements"?
Let's start with block (0,0). There, row is equal to threadIdx.y and col is equal to threadIdx.x. Let's consider the first warp in that block. The first 16 threads in that warp will have a threadIdx.y value of 0, and their threadIdx.x values will be increasing from 0..15. Likewise, the second 16 threads in that warp will have a threadIdx.y value of 1, and their threadIdx.x values will be increasing from 0..15.
Now let's compute the actual index generated for the first access point above, across the warp. Let's assume we are on the first loop iteration, so i is zero. Therefore this:
matrixLeft[row * D_K + i * TILE_WIDTH +tx];
reduces to:
matrixLeft[threadIdx.y * D_K + threadIdx.x];
D_K here is just the device copy of the K variable, which is 1000. Now let's evaluate the reduced index expression above across our selected warp (0) in our selected block (0,0):
warp lane:    0  1  2  3  4  5  6  ..   15    16    17    18  ..    31
threadIdx.x:  0  1  2  3  4  5  6       15     0     1     2        15
threadIdx.y:  0  0  0  0  0  0  0        0     1     1     1         1
index:        0  1  2  3  4  5  6       15  1000  1001  1002      1015
Therefore the generated index pattern shows a discontinuity between the 16th and 17th threads in the warp, and the access pattern does not fit your previously stated condition, "threads within a single warp access consecutive elements": we do not have coalesced access in this case (at least, for float quantities).
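A quick host-side sketch makes this pattern easy to reproduce. The program below is an illustration only, using the same K = 1000; it replays the reduced index calculation for warp 0 of block (0,0), first with a block x-dimension of 16 and then with 32. The first case jumps from index 15 to index 1000 in the middle of the warp, while the second touches indices 0..31 contiguously:
#include <cstdio>

int main() {
    const int K = 1000;                  // row stride of matrixLeft, as in the question
    const int widths[2] = {16, 32};      // candidate block x-dimensions to compare
    for (int w : widths) {
        printf("blockDim.x = %d:\n", w);
        for (int lane = 0; lane < 32; ++lane) {   // the 32 lanes of warp 0
            int tx = lane % w;                    // threadIdx.x for this lane
            int ty = lane / w;                    // threadIdx.y for this lane
            int index = ty * K + tx;              // matrixLeft index for block (0,0), i = 0
            printf("  lane %2d -> index %d\n", lane, index);
        }
    }
    return 0;
}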
answered Jan 2 at 17:34 by Robert Crovella
Thank you very much, Robert! Your explanation resolves my confusion. I think the problem is asking how to configure the matrix-multiplication kernel to avoid un-coalesced accesses to global memory. To do so, we need to make sure that dimBlock.x is divisible by 32 (the size of a warp). Am I right? – pounch_doggie, Jan 3 at 4:16
yes, a block x dimension of 32 should give coalesced access – Robert Crovella, Jan 3 at 4:43
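One way to make that requirement explicit is a small host-side guard before the launch. The checkBlockShape helper below is hypothetical, not part of the original code:
#include <cassert>
#include <cuda_runtime.h>

// Hypothetical guard: the coalesced row-wise loads rely on each warp covering
// 32 consecutive columns, so require the block x-dimension to be a whole
// multiple of the 32-thread warp.
static void checkBlockShape(const dim3 &dimBlock) {
    assert(dimBlock.x % 32 == 0);
}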