binary treeの再帰をループに展開したら遅くなった

http://shootout.alioth.debian.org/u32/which-programming-languages-are-fastest.php
このベンチマークサイトみてたらRubyがえらい遅かったので、なんとかならないかなーと思い、適当に手を出したのがbinary-treeのプログラム

# The Computer Language Shootout Benchmarks
# http://shootout.alioth.debian.org
#
# contributed by Jesse Millikan
# Modified by Wesley Moxam


def item_check(left, item, right)
  return item if left.nil?
  item + item_check(*left) - item_check(*right)
end

def bottom_up_tree(item, depth)
  return [nil, item, nil] unless depth > 0
  item_item = 2 * item
  depth -= 1
  [bottom_up_tree(item_item - 1, depth), item, bottom_up_tree(item_item, depth)]
end

max_depth = ARGV[0].to_i
min_depth = 4

max_depth = min_depth + 2 if min_depth + 2 > max_depth

stretch_depth = max_depth + 1
stretch_tree = bottom_up_tree(0, stretch_depth)

puts "stretch tree of depth #{stretch_depth}\t check: #{item_check(*stretch_tree)}"
stretch_tree = nil

long_lived_tree = bottom_up_tree(0, max_depth)

min_depth.step(max_depth + 1, 2) do |depth|
  iterations = 2**(max_depth - depth + min_depth)

  check = 0

  (1..iterations).each do |i|
    temp_tree = bottom_up_tree(i, depth)
    check += item_check(*temp_tree)

    temp_tree = bottom_up_tree(-i, depth)
    check += item_check(*temp_tree)
  end

  puts "#{iterations * 2}\t trees of depth #{depth}\t check: #{check}"
end

puts "long lived tree of depth #{max_depth}\t check: #{item_check(*long_lived_tree)}"

これをjrubyでプロファイルしたらこうなった

Total time: 27.76

     total        self    children       calls  method
----------------------------------------------------------------
     24.28        0.00       24.28           1  Numeric#step
     24.28        0.26       24.02           7  Range#each
     15.18        0.11       15.07    29578590  Object#item_check
     10.22        0.08       10.14    29578590  Object#bottom_up_tree
      2.05        0.15        1.90          79  Kernel#require
      1.48        1.48        0.00    14701928  Kernel#nil?
      1.32        1.32        0.00    14876684  NilClass#nil?
      0.74        0.00        0.74          15  Kernel#require
      0.23        0.01        0.22          53  Object.method_added
      0.22        0.00        0.22         111  Object.method_added

どうもitem_checkとbottom_up_treeの再帰呼び出しが重いらしい。
これをループとスタックで書きなおしてやれば早くなるんじゃないかと試してみた。

def item_check(left, item, right)
  branch = 0
  stack = []
  while true
    case branch
    when 0
      until left.nil?
        stack.push [1, left, item, right]
        left, item, right = *left
      end
    when 1
      stack.push [2, left, item, right]
      left, item, right = *right
      branch = 0
      next
    else
      item = item + left[1] - right[1]
    end
    
    return item if stack.length == 0
    branch, left, item, right = *stack.pop
  end
end

def bottom_up_tree(item, depth)
  branch = 0
  stack = []
  tr = tree = [nil, item, nil]
  while true
    case branch
    when 0
      until depth == 0
        stack.push [1, tree, depth]
        tree[0] = [nil, 2 * tree[1] - 1, nil]
        tree = tree[0]
        depth -= 1
      end
    when 1
      stack.push [2, tree, depth]
      tree[2] = [nil, 2 * tree[1], nil]
      tree = tree[2]
      depth -= 1
      branch = 0
      next
    end

    return tr if stack.length == 0
    branch, tree, depth = *stack.pop
  end
end

max_depth = ARGV[0].to_i
min_depth = 4

max_depth = min_depth + 2 if min_depth + 2 > max_depth

stretch_depth = max_depth + 1
stretch_tree = bottom_up_tree(0, stretch_depth)

puts "stretch tree of depth #{stretch_depth}\t check: #{item_check(*stretch_tree)}"
stretch_tree = nil

long_lived_tree = bottom_up_tree(0, max_depth)

min_depth.step(max_depth + 1, 2) do |depth|
  iterations = 2**(max_depth - depth + min_depth)

  check = 0

  (1..iterations).each do |i|
    temp_tree = bottom_up_tree(i, depth)
    check += item_check(*temp_tree)

    temp_tree = bottom_up_tree(-i, depth)
    check += item_check(*temp_tree)
  end

  puts "#{iterations * 2}\t trees of depth #{depth}\t check: #{check}"
end

puts "long lived tree of depth #{max_depth}\t check: #{item_check(*long_lived_tree)}"

結果は↓のとおり

Total time: 87.17

     total        self    children       calls  method
----------------------------------------------------------------
     82.77        0.00       82.77           1  Numeric#step
     82.77        0.29       82.48           7  Range#each
     45.59       27.18       18.41      174754  Object#bottom_up_tree
     39.29       23.22       16.08      174754  Object#item_check
      7.92        7.92        0.00    58807672  Array#push
      7.83        7.83        0.00    88211509  Array#[]
      6.38        6.38        0.00    58807672  Array#pop
      6.21        6.21        0.00    59157180  Array#length
      2.98        2.98        0.00    29403836  Array#[]=
      1.95        0.13        1.82          79  Kernel#require
      1.67        1.67        0.00    14876684  NilClass#nil?
      1.50        1.50        0.00    14701928  Kernel#nil?
      0.87        0.00        0.87          15  Kernel#require
      0.26        0.01        0.25          53  Object.method_added
      0.25        0.00        0.25         111  Object.method_added

むしろすごく遅くなってしまった。スタック操作に再帰処理以上の時間がかかってしまっている