main(argc, argv) {
assign default values for run_size and num_ways
parse the switches and, if appropriate, assign the user-defined
values to run-size and num_ways
open the output file
create two arrays of scratch files
read the input file, create the initial runs, and assign the runs
to the scratch output files (create_initial_runs)
sort the runs using the sort_merge algorithm (sort_merge)
close the output file
}
create_initial_runs(input_file_name, run_size, num_ways) {
allocate a dynamic array, a, large enough to accommodate runs of
size run_size
open the input file
for i = 0 to NUM_WAYS-1 {
open output_scratch_file i
}
more_input = true
next_output_file = 0
num_runs_per_output_file = 0
while (more_input) {
for i = 1 to run_size { /* entry 0 in the array is reserved for
a possible sentinel value. If your
sorting algorithm does not require a
sentinel value, the for loop could
start at i = 0 and go to
(run_size - 1) */
if (not end_of_input_file)
read a record into a[i]
else {
more_input = false
break
}
}
sort array a using an in-memory algorithm like quicksort
/* write the records to the appropriate scratch output file
for j = 1 to i { /* can't assume that the loop runs to run_size
since the last run's length may be less than
run_size */
write a[i] to scratch_output_file[next_output_file]
}
output the sentinel value to scratch_output_file[next_output_file]
/* everytime we get back to the first output file, increment the
number of runs per output file by 1 */
if (next_output_file == 0)
num_runs_per_output_file = num_runs_per_output_file + 1
next_output_file = (next_output_file + 1) % num_ways
}
/* make sure the same number of runs are assigned to each scratch
output file */
if (next_output_file != 0) {
for i = next_output_file to (num_ways - 1) {
output the sentinel value to scratch_output_file[i]
}
}
for i = 0 to (num_ways -1)
close scratch_output_file[i]
close the input file
return num_runs_per_output_file
}
sortmerge(output_file, num_runs_per_scratch_output_file, num_ways) {
for (N = num_runs_per_scratch_output_file; N > 1;
N = ceiling(N / num_ways) /* ceiling is a function that rounds up
to the nearest integer. You need to
write this function yourself */
{
open_scratch_files() /* open input and output scratch files */
for i = 0 to (N-1) {
create_run(output_scratch_files[i % num_ways], true, num_ways);
}
/* make sure the same number of runs are assigned to each scratch
output file */
if ((i % num_ways) != 0) {
for j = (i % num_ways) to (num_ways - 1) {
output the sentinel value to scratch_output_file[j]
}
}
close_scratch_files(); /* close input and output scratch files */
}
/* make the last run write into the output file */
open_scratch_files();
create_run(output_file, false, num_ways);
close_scratch_files();
}
create_run(output_file, generate_sentinel_value_flag, num_ways) {
/* initialize the merge_array */
for i = 0 to (num_ways -1) {
read a record from input_scratch_file i to merge_array[i]
/* create the run */
while (true) {
find the minimum key and the index of that minimum key (min_index)
in merge_array
if (min == SENTINEL_VALUE) /* the run is complete if the sentinel value
break; is reached */
write the record with the minimum key in merge_array to output_file
read the next record from the input_scratch_file with index min_index
into merge_array[min_index]
}
if (generate_sentinel_value_flag == true)
write the sentinal value to the output file
}
The easiest way to handle this alternation is to maintain a flag that keeps track of which of your two file arrays is currently the input array. For example, you might declare a flag called input1, which if true indicates that your first array is the current input array and if false indicates that your second array is the current input array.
As an example of the use of this flag, here is the code for open_scratch_files:
void open_scratch_files () {
// if input1 is true, open the first bank of files for input and
// the second bank for output
if (input1) {
for (i = 0; i < num_ways; i++) {
filearray1[i].Open('i');
filearray2[i].Open('o');
}
}
// if input1 is false, open the first bank of files for output and
// the second bank for input
else {
for (i = 0; i < num_ways; i++) {
filearray1[i].Open('o');
filearray2[i].Open('i');
}
}
}
Performance
- If M records can be sorted in-memory and the file consists of
N records, then the number of initial runs is N / M.
- If there are 2P I/O units available, then the number of
subsequent passes is ceiling(logP(N / M)) since each
pass reduces the number of runs by P. Here ceiling(x) means
the smallest integer greater than or equal to x.