/* create_initial_runs: reads the input file and distributes sorted
   initial runs round-robin over num_ways scratch output files, using
   replacement selection on a heap of at most run_size records.  Every
   run is terminated by a sentinel value, and empty runs (a lone
   sentinel) are appended where needed so that each scratch file ends up
   with the same number of runs.

   input_file_name: name of the file holding the records to sort
   run_size:        maximum number of records held in the heap
   num_ways:        number of scratch output files

   Returns the number of runs written to each scratch output file. */
create_initial_runs(input_file_name, run_size, num_ways) {
    allocate a dynamic array, a, with run_size + 1 slots: slots
        1..run_size hold the heap and slot 0 is the scratch slot that
        get_item reads each new record into
    open the input file using the fields package
    for i = 0 to num_ways-1 {
        open scratch_output_file[i] using fopen
    }
    end_of_input = false
    next_output_file = 0
    num_runs_per_output_file = 0

    /* instantiate the heap */
    for i = 1 to run_size {
        read a record into a[i]
        /* Running out of input here only means the heap is not full.
           end_of_input is deliberately NOT set: the records already in
           the heap still have to be written out by the loop below, and
           get_item detects the end of the file on its own. */
        if (end of input) {
            break
        }
        /* indicate the record belongs in the current run and insert the
           record into the heap */
        a[i]->mark = current_mark;
        upheap(i);
    }
    /* initialize the heap size: i is one past the last slot filled,
       whether the loop ran to completion or was left early */
    N = i - 1;
    /* with no records at all there is nothing to output, so skip the
       run-creation loop entirely */
    if (N == 0) {
        end_of_input = true
    }

    /* create the initial runs--get_item sets end_of_input to true only
       when there is no more input and the heap is exhausted */
    while (end_of_input == false) {
        /* keep outputting records to the current run until get_item
           returns a NULL record */
        for (record = get_item(input_file, &end_of_input); record != NULL;
             record = get_item(input_file, &end_of_input)) {
            write record to scratch_output_file[next_output_file]
        }
        output the sentinel value to scratch_output_file[next_output_file]
        /* every time we get back to the first output file, increment
           the number of runs per output file by 1 */
        if (next_output_file == 0) {
            num_runs_per_output_file = num_runs_per_output_file + 1
        }
        next_output_file = (next_output_file + 1) % num_ways
    }

    /* make sure the same number of runs are assigned to each scratch
       output file: the files that did not receive a run in the last
       round get an empty run consisting of just the sentinel */
    if (next_output_file != 0) {
        for i = next_output_file to (num_ways - 1) {
            output the sentinel value to scratch_output_file[i]
        }
    }
    for i = 0 to (num_ways - 1) {
        close scratch_output_file[i] using fclose
    }
    close the input file using the fields package
    return num_runs_per_output_file
}
There is also a global variable:
current_mark: Indicates which of the two values a record's mark field must
be set to in order to be included in this run.
Here is the pseudo-code:
/* if the two records both belong in the run or both don't belong in the
run, compare the two values and return true if the first value is less
than the first. Otherwise, return true if the first record belongs in
the run, and false otherwise (in the latter case, the second record
belongs in the run and hence it is "less than" the first record). */
int less_than_or_equal(record1, record2) {
if (record1->mark == record2->mark)
return (record1->key <= record2->key)
else
return (record1->mark == current_mark)
}
/* Sift the record stored in a[k] down the heap until it is
   less_than_or_equal to the smaller of its children, or until it
   reaches a slot with no children.  The children of slot k are slots
   2k and 2k+1, and N is the index of the last occupied slot.

   get_item also calls this with k == 0.  In that case the first
   "children" examined are a[0] itself and the root a[1], so whichever
   of the two orders first is left in a[0] and the other one is sifted
   into the heap. */
downheap (int k) {
    Record item = a[k];
    int child;

    for (; k <= N/2; k = child) {
        child = k + k;
        /* prefer the right child when it orders no later than the left */
        if (child < N && less_than_or_equal(a[child+1], a[child]))
            child++;
        /* the item has found its place once it is no larger than the
           smaller child */
        if (less_than_or_equal(item, a[child]))
            break;
        a[k] = a[child];
    }
    a[k] = item;
}
/* Report whether the heap holds no records: yields 1 when the global
   heap size N is zero and 0 otherwise. */
int heap_empty () {
    if (N != 0)
        return 0;
    return 1;
}
/* get_item returns the next item in the run.  It returns NULL if the
   end of a run or the end of the input is reached.  The flag
   end_of_input allows get_item to indicate whether the reason for the
   NULL is the end of a run (false) or the end of the input (true).
   end_of_input must be a pointer to allow a value to be passed back
   through the end_of_input parameter; it is only written when NULL is
   returned.

   The static flag more_input remembers, across calls, whether the
   input file may still contain records. */
PERSONNEL get_item(input_file, *end_of_input) {
    static int more_input = true;

    /* determine if the heap is empty. If it is, then the end of the
       input has been reached, so set end_of_input to true */
    if (heap_empty()) {
        *end_of_input = true;
        return NULL;
    }

    /* determine if the root element belongs in this run */
    if (a[1]->mark != current_mark) {
        /* reverse the mark so that all the elements in the heap will be
           available for the next run */
        current_mark = !(current_mark);
        *end_of_input = false;
        return NULL;
    }

    if (more_input) {
        if (not end of file) {
            read a record into a[0]
            /* determine if the newly read record belongs in the current
               run or the next run: a key smaller than the root's key
               cannot be output after the root, so it must wait for the
               next run */
            if (a[0]->key >= a[1]->key) {
                a[0]->mark = current_mark
            }
            else {
                a[0]->mark = !(current_mark)
            }
            /* insert the newly read record into the heap by executing a
               replace operation (i.e., the root of the heap will be
               returned and the new value will be pushed onto the heap) */
            downheap(0);
            return a[0];
        }
        else {
            /* once the input is exhausted, start returning items from
               the heap */
            more_input = false;
            return remove_item();
        }
    }
    else {
        /* if the input has been exhausted, remove the top item on the
           heap and return it */
        return remove_item();
    }
}
log[P](N / 2M) < 2 ==> N / 2M < P**2 ==> sqrt(N / 2M) < P