- Joined
- 12/14/10
- Messages
- 131
- Points
- 38
I read the very nice article “C++ Multithreading in Boost” on Quantnet, and tried my own simple example.
The n-word checksum of the consecutive integers 0, 1, … is computed. For example, for n = 3 it looks like
0 1 2
3 4 5
6 7 8
: : :
The strange thing is, that on a quad-core computer, the performance initially decreased with the number of threads and gradually increased to optimal performance around 30 threads. Also, the run time was unstable for small numbers of threads (>=2), for example, occasionally 4 threads runs fast as one would expect on a quad-core (as fast as 17.44s in the example below).
My question: Is there a bug in my code, or is there an explanation for this behaviour?
The output for the case n = 1 follows, and the source code is below.
>checksum.exe 9999999999 1 1
4 cores/processors detected on this system.
----- 1 word (8 byte) checksum of 0 through 9999999999 is: 0x00000002540be3ff-----
1 thread(s) used in parallel
61.24 s
...
2 thread(s) used in parallel
93.63 s
3 thread(s) used in parallel
76.76 s
4 thread(s) used in parallel
49.33 s
5 thread(s) used in parallel
20.48 s
10 thread(s) used in parallel
45.35 s
20 thread(s) used in parallel
25.83 s
30 thread(s) used in parallel
21.70 s
40 thread(s) used in parallel
23.91 s
The n-word checksum of the consecutive integers 0, 1, … is computed. For example, for n = 3 it looks like
0 1 2
3 4 5
6 7 8
: : :
The strange thing is, that on a quad-core computer, the performance initially decreased with the number of threads and gradually increased to optimal performance around 30 threads. Also, the run time was unstable for small numbers of threads (>=2), for example, occasionally 4 threads runs fast as one would expect on a quad-core (as fast as 17.44s in the example below).
My question: Is there a bug in my code, or is there an explanation for this behaviour?
The output for the case n = 1 follows, and the source code is below.
>checksum.exe 9999999999 1 1
4 cores/processors detected on this system.
----- 1 word (8 byte) checksum of 0 through 9999999999 is: 0x00000002540be3ff-----
1 thread(s) used in parallel
61.24 s
...
2 thread(s) used in parallel
93.63 s
3 thread(s) used in parallel
76.76 s
4 thread(s) used in parallel
49.33 s
5 thread(s) used in parallel
20.48 s
10 thread(s) used in parallel
45.35 s
20 thread(s) used in parallel
25.83 s
30 thread(s) used in parallel
21.70 s
40 thread(s) used in parallel
23.91 s
Code:
#include <iostream>
#include <vector>
#include <boost/format.hpp>
#include <boost/progress.hpp>
#include <boost/thread.hpp>
#include <boost/lexical_cast.hpp>
using namespace std;
using boost::format;
using boost::io::group;
using boost::progress_timer;
using boost::thread;
using boost::thread_group;
using boost::lexical_cast;
/** class for computing checksums */
template<typename IntType = long>
class checksum
{
public:
/// n_ is the number of elements of type IntType in each checksum
checksum(unsigned n_ = 1) : n(n_), value(n) {}
/// Adds a single word to the current checksum
template<typename RanIter>
void operator()(RanIter begin);
/// get current checksum
vector<IntType> getChecksum() const {return value;}
private:
const unsigned n;
vector<IntType> value;
};
template<typename IntType>
template<typename RanIter>
inline void checksum<IntType>::operator()(RanIter begin)
{
for (unsigned i = 0; i < n; ++i, ++begin)
value[i] ^= *begin;
}
/** class used to create threads */
template<typename IntType = long>
class consecutive_checksum
{
public:
consecutive_checksum(IntType length_ = 1, IntType start = 0, unsigned words = 1) : length(length_), counter(start), n(words), sum(n) {}
void operator()();
vector<IntType> getChecksum() const {return sum.getChecksum();}
private:
const IntType length;
IntType counter;
const unsigned n;
checksum<IntType> sum;
};
template<typename IntType>
void consecutive_checksum<IntType>::operator()()
{
vector<IntType> values(n);
for (IntType j = 0; j < length; ++j)
{
for (unsigned k = 0; k < n; ++k)
values[k] = counter++;
sum(values.begin());
}
}
/** main thread */
int main(int argc, char * argv[])
{
if (argc != 4)
{
cerr << "Incorrect number of arguments.\n";
exit(EXIT_FAILURE);
}
typedef long long IntType; /// integer type to use for counters
const IntType i = lexical_cast<IntType>(argv[1]);
const unsigned n = lexical_cast<unsigned>(argv[2]);
const unsigned threads = lexical_cast<unsigned>(argv[3]);
cout << endl << format("%1% cores/processors detected on this system.\n\n")
% thread::hardware_concurrency();
progress_timer timer;
IntType iterationsPerThread = IntType( ceil(double(i) / threads) ); /// round up
thread_group threadSet;
typedef consecutive_checksum<IntType> consCSType;
vector< shared_ptr<consCSType> > csum;
for (unsigned m = 0; m < threads; ++m)
{
IntType iterations( m < threads - 1 ? iterationsPerThread : i - iterationsPerThread * (threads - 1) );
shared_ptr<consCSType> p( new consCSType(iterations, iterationsPerThread * m * n, n) );
csum.push_back(p);
/// create a thread using a reference so that the class gets updated
threadSet.create_thread( ref(*csum[m].get()) );
}
/// wait for all the threads to complete
threadSet.join_all();
/// combine the checksums of each thread
vector<IntType> result(n);
for (unsigned m = 0; m < threads; ++m)
{
vector<IntType> cs( csum[m] -> getChecksum() );
for (unsigned k = 0; k < n; ++k)
result[k] ^= cs[k];
}
cout << format("----- %4% word (%5% byte) checksum of %2% through %3% is: ")
% 1 % 0 % (i * n) % n % (n * sizeof(IntType));
/// output the checksum in hexadecimal
cout << "0x";
for (vector<IntType>::const_iterator iter = result.begin(); iter != result.end(); ++iter)
{
cout << format("%1%") % group(hex, setw(sizeof(IntType) * 2), setfill('0'), *iter);
}
cout << "-----\n";
cout << format("%1% thread(s) used in parallel\n") % threads;
return EXIT_SUCCESS;
}