partitioner.h

/*
    Copyright 2005-2011 Intel Corporation.  All Rights Reserved.

    The source code contained or described herein and all documents related
    to the source code ("Material") are owned by Intel Corporation or its
    suppliers or licensors.  Title to the Material remains with Intel
    Corporation or its suppliers and licensors.  The Material is protected
    by worldwide copyright laws and treaty provisions.  No part of the
    Material may be used, copied, reproduced, modified, published, uploaded,
    posted, transmitted, distributed, or disclosed in any way without
    Intel's prior express written permission.

    No license under any patent, copyright, trade secret or other
    intellectual property right is granted to or conferred upon you by
    disclosure or delivery of the Materials, either expressly, by
    implication, inducement, estoppel or otherwise.  Any license under such
    intellectual property rights must be express and approved by Intel in
    writing.
*/

#ifndef __TBB_partitioner_H
#define __TBB_partitioner_H

#ifndef __TBB_INITIAL_CHUNKS
#define __TBB_INITIAL_CHUNKS 2
#endif
#ifndef __TBB_RANGE_POOL_CAPACITY
#define __TBB_RANGE_POOL_CAPACITY 8
#endif
#ifndef __TBB_INIT_DEPTH
#define __TBB_INIT_DEPTH 5
#endif
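
// The three macros above are internal tuning knobs, not public API: initial
// chunks per worker, range pool capacity, and initial splitting depth. Because
// of the #ifndef guards, a translation unit can in principle predefine them
// before this header is pulled in. A hypothetical sketch (kept out of the
// build; values for illustration only):
#if 0
#define __TBB_RANGE_POOL_CAPACITY 16   // deeper range pool per task
#define __TBB_INIT_DEPTH 6             // allow one more initial split level
#include "tbb/partitioner.h"
#endif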

#include "task.h"
#include "aligned_space.h"
#include "atomic.h"

#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
    // Workaround for overzealous compiler warnings
    #pragma warning (push)
    #pragma warning (disable: 4244)
#endif

namespace tbb {

class auto_partitioner;
class simple_partitioner;
class affinity_partitioner;
namespace interface6 {
    namespace internal {
        class affinity_partition_type;
    }
}

namespace internal {
size_t __TBB_EXPORTED_FUNC get_initial_auto_partitioner_divisor();

//! Defines the entry point for the affinity partitioner into the TBB run-time library.
class affinity_partitioner_base_v3: no_copy {
    friend class tbb::affinity_partitioner;
    friend class tbb::interface6::internal::affinity_partition_type;
    //! Array that remembers affinities of tree positions to affinity_id.
    /** NULL if my_size==0. */
    affinity_id* my_array;
    //! Number of elements in my_array.
    size_t my_size;
    //! Zeros the fields.
    affinity_partitioner_base_v3() : my_array(NULL), my_size(0) {}
    //! Deallocates my_array.
    ~affinity_partitioner_base_v3() {resize(0);}
    //! Resize my_array.
    /** Retains values if the resulting size is the same. */
    void __TBB_EXPORTED_METHOD resize( unsigned factor );
};

//! Provides backward-compatible methods for partition objects.
class partition_type_base {
public:
    void set_affinity( task & ) {}
    void note_affinity( task::affinity_id ) {}
    task* continue_after_execute_range() {return NULL;}
    bool decide_whether_to_delay() {return false;}
    void spawn_or_delay( bool, task& b ) {
        task::spawn(b);
    }
};

template<typename Range, typename Body, typename Partitioner> class start_scan;

} // namespace internal

namespace serial {
namespace interface6 {
template<typename Range, typename Body, typename Partitioner> class start_for;
}
}

namespace interface6 {
namespace internal {
using namespace tbb::internal;
template<typename Range, typename Body, typename Partitioner> class start_for;
template<typename Range, typename Body, typename Partitioner> class start_reduce;

//! Join task node that contains the shared flag for stealing feedback.
class flag_task: public task {
public:
    tbb::atomic<bool> child_stolen;
    flag_task() { child_stolen = false; }
    task* execute() { return NULL; }
};

//! Task that signals demand without carrying any actual work.
class signal_task: public task {
public:
    task* execute() {
        if( is_stolen_task() ) {
            static_cast<flag_task*>(parent())->child_stolen = true;
        }
        return NULL;
    }
};

//! Depth is the relative depth of recursive division inside a range pool. A relative
//! depth allows unbounded absolute recursion depth for heavily imbalanced workloads
//! whose ranges are represented by numbers that do not fit into a machine word.
typedef unsigned char depth_t;

//! The range pool stores ranges of type T in a circular buffer with MaxCapacity.
template <typename T, depth_t MaxCapacity>
class range_vector {
    depth_t my_head;
    depth_t my_tail;
    depth_t my_size;
    depth_t my_depth[MaxCapacity]; // relative depths of stored ranges
    tbb::aligned_space<T, MaxCapacity> my_pool;

public:
    //! Initialize via the first range in the pool.
    range_vector(const T& elem) : my_head(0), my_tail(0), my_size(1) {
        my_depth[0] = 0;
        new( my_pool.begin() ) T(elem);//TODO: std::move?
    }
    ~range_vector() {
        while( !empty() ) pop_back();
    }
    bool empty() const { return my_size == 0; }
    depth_t size() const { return my_size; }
    //! Populates the range pool by splitting ranges up to max_depth or while divisible.
    //! max_depth starts from 0, e.g. value 2 makes up to 3 ranges in the pool, the deepest being 1/4 pieces.
    void split_to_fill(depth_t max_depth) {
        while( my_size < MaxCapacity && my_depth[my_head] < max_depth
          && my_pool.begin()[my_head].is_divisible() ) {
            depth_t prev = my_head;
            my_head = (my_head + 1) % MaxCapacity;
            new(my_pool.begin()+my_head) T(my_pool.begin()[prev]); // copy TODO: std::move?
            my_pool.begin()[prev].~T(); // instead of assignment
            new(my_pool.begin()+prev) T(my_pool.begin()[my_head], split()); // do 'inverse' split
            my_depth[my_head] = ++my_depth[prev];
            my_size++;
        }
    }
    void pop_back() {
        __TBB_ASSERT(my_size > 0, "range_vector::pop_back() with empty size");
        my_pool.begin()[my_head].~T();
        my_size--;
        my_head = (my_head + MaxCapacity - 1) % MaxCapacity;
    }
    void pop_front() {
        __TBB_ASSERT(my_size > 0, "range_vector::pop_front() with empty size");
        my_pool.begin()[my_tail].~T();
        my_size--;
        my_tail = (my_tail + 1) % MaxCapacity;
    }
    T& back() {
        __TBB_ASSERT(my_size > 0, "range_vector::back() with empty size");
        return my_pool.begin()[my_head];
    }
    T& front() {
        __TBB_ASSERT(my_size > 0, "range_vector::front() with empty size");
        return my_pool.begin()[my_tail];
    }
    //! Similarly to front(), returns the depth of the first range in the pool.
    depth_t front_depth() {
        __TBB_ASSERT(my_size > 0, "range_vector::front_depth() with empty size");
        return my_depth[my_tail];
    }
};
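
// A minimal sketch (kept out of the build with #if 0) of how this internal pool
// behaves, assuming tbb::blocked_range as the divisible range type. Each split
// takes the second half out of the head range, so the deepest, leftmost piece
// sits at back(), while front() holds the shallowest, rightmost piece that
// execute() below offers to thieves first.
#if 0
#include "tbb/blocked_range.h"
int main() {
    tbb::blocked_range<int> r(0, 64);
    tbb::interface6::internal::range_vector<tbb::blocked_range<int>, 8> pool(r);
    pool.split_to_fill(3);   // split the head range until depth 3 or the pool is full
    while( !pool.empty() )
        pool.pop_back();     // process deepest-first, as execute() does
    return 0;
}
#endif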

//! Provides default methods for partition objects and common algorithm blocks for execute().
template <typename Partition>
struct partition_type_base {
    // decision makers
    void set_affinity( task & ) {}
    void note_affinity( task::affinity_id ) {}
    bool check_being_stolen(task &) { return false; } // part of old should_execute_range()
    bool check_for_demand(task &) { return false; }
    bool divisions_left() { return true; } // part of old should_execute_range()
    bool should_create_trap() { return false; }
    depth_t max_depth() { return 0; }
    void align_depth(depth_t) { }
    // common function blocks
    Partition& derived() { return *static_cast<Partition*>(this); }
    template<typename StartType>
    flag_task* split_work(StartType &start) {
        flag_task* parent_ptr = start.create_continuation(); // the type here is to express the expectation
        start.set_parent(parent_ptr);
        parent_ptr->set_ref_count(2);
        StartType& right_work = *new( parent_ptr->allocate_child() ) StartType(start, split());
        start.spawn(right_work);
        return parent_ptr;
    }
    template<typename StartType, typename Range>
    void execute(StartType &start, Range &range) {
        // The algorithm in a few words ([] denotes calls to decision methods of the partitioner):
        // [If this task is stolen, adjust depth and divisions if necessary, set flag].
        // If range is divisible {
        //    Spread the work while [initial divisions left];
        //    Create trap task [if necessary];
        // }
        // If not divisible or [max depth is reached], execute, else do the range pool part
        task* parent_ptr = start.parent();
        if( range.is_divisible() ) {
            if( derived().divisions_left() )
                do parent_ptr = split_work(start); // split until divisions_left()
                while( range.is_divisible() && derived().divisions_left() );
            if( derived().should_create_trap() ) { // only for range pool
                if( parent_ptr->ref_count() > 1 ) { // create new parent if necessary
                    parent_ptr = start.create_continuation();
                    start.set_parent(parent_ptr);
                } else __TBB_ASSERT(parent_ptr->ref_count() == 1, NULL);
                parent_ptr->set_ref_count(2); // safe because parent has only one reference
                signal_task& right_signal = *new( parent_ptr->allocate_child() ) signal_task();
                start.spawn(right_signal); // a pure signal avoids deep recursion at the end
            }
        }
        if( !range.is_divisible() || !derived().max_depth() )
            start.run_body( range ); // the simple partitioner always goes here
        else { // do range pool
            internal::range_vector<Range, Partition::range_pool_size> range_pool(range);
            do {
                range_pool.split_to_fill(derived().max_depth()); // fill range pool
                if( derived().check_for_demand( start ) ) {
                    if( range_pool.size() > 1 ) {
                        parent_ptr = start.create_continuation();
                        start.set_parent(parent_ptr);
                        parent_ptr->set_ref_count(2);
                        StartType& right_work = *new( parent_ptr->allocate_child() ) StartType(start, range_pool.front(), range_pool.front_depth());
                        start.spawn(right_work);
                        range_pool.pop_front();
                        continue;
                    }
                    if( range_pool.back().is_divisible() ) // there was not enough depth to fork a task
                        continue; // note: check_for_demand() should guarantee increasing max_depth() next time
                }
                start.run_body( range_pool.back() );
                range_pool.pop_back();
            } while( !range_pool.empty() && !start.is_cancelled() );
        }
    }
};

//! Provides default methods for auto (adaptive) partition objects.
template <typename Partition>
struct auto_partition_type_base : partition_type_base<Partition> {
    size_t my_divisor;
    depth_t my_max_depth;
    auto_partition_type_base() : my_max_depth(__TBB_INIT_DEPTH) {
        my_divisor = tbb::internal::get_initial_auto_partitioner_divisor()*__TBB_INITIAL_CHUNKS/4;
        __TBB_ASSERT(my_divisor, "initial value of get_initial_auto_partitioner_divisor() is not valid");
    }
    auto_partition_type_base(auto_partition_type_base &src, split) {
        my_max_depth = src.my_max_depth;
#if __TBB_INITIAL_TASK_IMBALANCE
        if( src.my_divisor <= 1 ) my_divisor = 0;
        else my_divisor = src.my_divisor = (src.my_divisor+1u) / 2u;
#else
        my_divisor = src.my_divisor / 2u;
        src.my_divisor = src.my_divisor - my_divisor; // TODO: check the effect separately
        if(my_divisor) src.my_max_depth += static_cast<depth_t>(__TBB_Log2(src.my_divisor/my_divisor));
#endif
    }
    bool check_being_stolen( task &t) { // part of old should_execute_range()
        if( !my_divisor ) {
            my_divisor = 1; // TODO: replace by an on-stack flag (partition_state's member)?
            if( t.is_stolen_task() ) {
#if TBB_USE_EXCEPTIONS
                // RTTI is available, check whether the cast is valid
                __TBB_ASSERT(dynamic_cast<flag_task*>(t.parent()), 0);
                // correctness of the cast relies on avoiding the root task, for which:
                // - the initial value of my_divisor != 0 (protected by a separate assertion);
                // - is_stolen_task() always returns false.
#endif
                static_cast<flag_task*>(t.parent())->child_stolen = true;
                my_max_depth++;
                return true;
            }
        }
        return false;
    }
    bool divisions_left() { // part of old should_execute_range()
        if( my_divisor > 1 ) return true;
        if( my_divisor && my_max_depth > 1 ) { // can split the task, and once more internally. TODO: on-stack flag instead
            // keep the same fragmentation while splitting for the local task pool
            my_max_depth--;
            my_divisor = 0;
            return true;
        } else return false;
    }
    bool should_create_trap() {
        return my_divisor > 0;
    }
    bool check_for_demand(task &t) {
        if( static_cast<flag_task*>(t.parent())->child_stolen ) {
            my_max_depth++;
            return true;
        } else return false;
    }
    void align_depth(depth_t base) {
        __TBB_ASSERT(base <= my_max_depth, 0);
        my_max_depth -= base;
    }
    depth_t max_depth() { return my_max_depth; }
};
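
// A back-of-envelope sketch (not compiled) of how my_divisor spreads across
// splits in the branch above without __TBB_INITIAL_TASK_IMBALANCE: each split
// hands half of the remaining divisor to the new subtask and keeps the rest.
// The starting value 16 is hypothetical; the real one scales with thread count.
#if 0
#include <cstdio>
int main() {
    unsigned divisor = 16;              // e.g. initial divisor of the root task
    while( divisor > 1 ) {
        unsigned taken = divisor / 2u;  // goes to the newly split subtask
        divisor -= taken;               // stays with the source task
        std::printf("kept %u, gave away %u\n", divisor, taken);
    }
    return 0;
}
#endif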

//! Provides default methods for affinity (adaptive) partition objects.
class affinity_partition_type : public auto_partition_type_base<affinity_partition_type> {
    static const unsigned factor_power = 4;
    static const unsigned factor = 1<<factor_power;
    bool my_delay;
    unsigned map_begin, map_end, map_mid;
    tbb::internal::affinity_id* my_array;
    void set_mid() {
        unsigned d = (map_end - map_begin)/2; // we could add 1, but that rather suits LIFO affinity
        if( d > factor )
            d &= 0u-factor; // round down to a multiple of factor
        map_mid = map_end - d;
    }
public:
    affinity_partition_type( tbb::internal::affinity_partitioner_base_v3& ap ) {
        __TBB_ASSERT( (factor&(factor-1))==0, "factor must be power of two" );
        ap.resize(factor);
        my_array = ap.my_array;
        map_begin = 0;
        map_end = unsigned(ap.my_size);
        set_mid();
        my_delay = true;
        my_divisor /= __TBB_INITIAL_CHUNKS; // let exactly P tasks be distributed across workers
        my_max_depth = factor_power+1; // the first factor_power ranges will be spawned, and >=1 range should be left
        __TBB_ASSERT( my_max_depth < __TBB_RANGE_POOL_CAPACITY, 0 );
    }
    affinity_partition_type(affinity_partition_type& p, split)
        : auto_partition_type_base<affinity_partition_type>(p, split()), my_array(p.my_array) {
        __TBB_ASSERT( p.map_end-p.map_begin<factor || (p.map_end-p.map_begin)%factor==0, NULL );
        map_end = p.map_end;
        map_begin = p.map_end = p.map_mid;
        set_mid(); p.set_mid();
        my_delay = p.my_delay;
    }
    void set_affinity( task &t ) {
        if( map_begin<map_end )
            t.set_affinity( my_array[map_begin] );
    }
    void note_affinity( task::affinity_id id ) {
        if( map_begin<map_end )
            my_array[map_begin] = id;
    }
    bool check_for_demand( task &t ) {
        if( !my_delay ) {
            if( map_mid<map_end ) {
                __TBB_ASSERT(my_max_depth>__TBB_Log2(map_end-map_mid), 0);
                return true;// do not do my_max_depth++ here, but be sure my_max_depth is big enough
            }
            if( static_cast<flag_task*>(t.parent())->child_stolen ) {
                my_max_depth++;
                return true;
            }
        } else my_delay = false;
        return false;
    }
    bool divisions_left() { // part of old should_execute_range()
        return my_divisor > 1;
    }
    bool should_create_trap() {
        return true; // TODO: rethink for the stage after memorizing level
    }
    static const unsigned range_pool_size = __TBB_RANGE_POOL_CAPACITY;
};
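
// Quick check (not compiled) of the bit trick in set_mid() above: with factor
// a power of two, d &= 0u-factor clears the low factor_power bits, rounding d
// down to a multiple of factor so the affinity map stays block-aligned.
#if 0
#include <cstdio>
int main() {
    const unsigned factor = 16;
    for( unsigned d = 30; d <= 33; ++d )
        std::printf("%u -> %u\n", d, d & (0u - factor)); // 30->16, 31->16, 32->32, 33->32
    return 0;
}
#endif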

class auto_partition_type: public auto_partition_type_base<auto_partition_type> {
public:
    auto_partition_type( const auto_partitioner& ) {}
    auto_partition_type( auto_partition_type& src, split)
      : auto_partition_type_base<auto_partition_type>(src, split()) {}
    static const unsigned range_pool_size = __TBB_RANGE_POOL_CAPACITY;
};

class simple_partition_type: public partition_type_base<simple_partition_type> {
public:
    simple_partition_type( const simple_partitioner& ) {}
    simple_partition_type( const simple_partition_type&, split ) {}
    //! Simplified algorithm: split while divisible, then execute.
    template<typename StartType, typename Range>
    void execute(StartType &start, Range &range) {
        while( range.is_divisible() )
            split_work( start );
        start.run_body( range );
    }
    //static const unsigned range_pool_size = 1; - not necessary because execute() is overridden
};

//! Backward-compatible partition type for the auto and affinity partitioners.
class old_auto_partition_type: public tbb::internal::partition_type_base {
    size_t num_chunks;
    static const size_t VICTIM_CHUNKS = 4;
public:
    bool should_execute_range(const task &t) {
        if( num_chunks<VICTIM_CHUNKS && t.is_stolen_task() )
            num_chunks = VICTIM_CHUNKS;
        return num_chunks==1;
    }
    old_auto_partition_type( const auto_partitioner& )
      : num_chunks(internal::get_initial_auto_partitioner_divisor()*__TBB_INITIAL_CHUNKS/4) {}
    old_auto_partition_type( const affinity_partitioner& )
      : num_chunks(internal::get_initial_auto_partitioner_divisor()*__TBB_INITIAL_CHUNKS/4) {}
    old_auto_partition_type( old_auto_partition_type& pt, split ) {
        num_chunks = pt.num_chunks = (pt.num_chunks+1u) / 2u;
    }
};

} // namespace interfaceX::internal
} // namespace interfaceX

//! A simple partitioner.
/** Divides the range until the range is not divisible.
    @ingroup algorithms */
class simple_partitioner {
public:
    simple_partitioner() {}
private:
    template<typename Range, typename Body, typename Partitioner> friend class serial::interface6::start_for;
    template<typename Range, typename Body, typename Partitioner> friend class interface6::internal::start_for;
    template<typename Range, typename Body, typename Partitioner> friend class interface6::internal::start_reduce;
    template<typename Range, typename Body, typename Partitioner> friend class internal::start_scan;
    // backward compatibility
    class partition_type: public internal::partition_type_base {
    public:
        bool should_execute_range(const task& ) {return false;}
        partition_type( const simple_partitioner& ) {}
        partition_type( const partition_type&, split ) {}
    };
    // new implementation just extends the existing interface
    typedef interface6::internal::simple_partition_type task_partition_type;
};
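
// Usage sketch (illustrative, not compiled), assuming tbb/parallel_for.h and a
// C++0x lambda: with simple_partitioner, the grain size of the blocked_range is
// the main tuning knob, since splitting continues until chunks fall below it.
#if 0
#include <cstddef>
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
void scale( float* a, std::size_t n ) {
    tbb::parallel_for(
        tbb::blocked_range<std::size_t>(0, n, 1024),     // grain size 1024
        [=]( const tbb::blocked_range<std::size_t>& r ) {
            for( std::size_t i = r.begin(); i != r.end(); ++i )
                a[i] *= 2.0f;
        },
        tbb::simple_partitioner() );
}
#endif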

//! An auto partitioner.
/** The range is initially divided into several large chunks. Chunks are further
    subdivided into smaller pieces if demand is detected and they are divisible.
    @ingroup algorithms */
class auto_partitioner {
public:
    auto_partitioner() {}

private:
    template<typename Range, typename Body, typename Partitioner> friend class serial::interface6::start_for;
    template<typename Range, typename Body, typename Partitioner> friend class interface6::internal::start_for;
    template<typename Range, typename Body, typename Partitioner> friend class interface6::internal::start_reduce;
    template<typename Range, typename Body, typename Partitioner> friend class internal::start_scan;
    // backward compatibility
    typedef interface6::internal::old_auto_partition_type partition_type;
    // new implementation just extends the existing interface
    typedef interface6::internal::auto_partition_type task_partition_type;
};
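
// Usage sketch (illustrative, not compiled): auto_partitioner adapts chunk
// sizes at run time, so no grain size tuning is needed; passing it explicitly
// is equivalent to the library default for parallel_for.
#if 0
#include <cstddef>
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
void scale( float* a, std::size_t n ) {
    tbb::parallel_for(
        tbb::blocked_range<std::size_t>(0, n),
        [=]( const tbb::blocked_range<std::size_t>& r ) {
            for( std::size_t i = r.begin(); i != r.end(); ++i )
                a[i] *= 2.0f;
        },
        tbb::auto_partitioner() );
}
#endif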

//! An affinity partitioner. Improves range subdivision for cache reuse across repeated runs.
class affinity_partitioner: internal::affinity_partitioner_base_v3 {
public:
    affinity_partitioner() {}

private:
    template<typename Range, typename Body, typename Partitioner> friend class serial::interface6::start_for;
    template<typename Range, typename Body, typename Partitioner> friend class interface6::internal::start_for;
    template<typename Range, typename Body, typename Partitioner> friend class interface6::internal::start_reduce;
    template<typename Range, typename Body, typename Partitioner> friend class internal::start_scan;
    // backward compatibility - for parallel_scan only
    typedef interface6::internal::old_auto_partition_type partition_type;
    // new implementation just extends the existing interface
    typedef interface6::internal::affinity_partition_type task_partition_type;
};
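
// Usage sketch (illustrative, not compiled): the partitioner object carries
// state (my_array above) between runs, so the same instance must be reused
// across invocations to replay the iteration-to-thread mapping on warm caches.
#if 0
#include <cstddef>
#include "tbb/parallel_for.h"
#include "tbb/blocked_range.h"
void relax( float* a, std::size_t n, int sweeps ) {
    tbb::affinity_partitioner ap;           // reused by every sweep below
    for( int s = 0; s < sweeps; ++s )
        tbb::parallel_for(
            tbb::blocked_range<std::size_t>(0, n),
            [=]( const tbb::blocked_range<std::size_t>& r ) {
                for( std::size_t i = r.begin(); i != r.end(); ++i )
                    a[i] = 0.5f * a[i];
            },
            ap );
}
#endif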

} // namespace tbb

#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
    #pragma warning (pop)
#endif // warning 4244 is back
#undef __TBB_INITIAL_CHUNKS
#undef __TBB_RANGE_POOL_CAPACITY
#undef __TBB_INIT_DEPTH
#endif /* __TBB_partitioner_H */
