[Midnightbsd-cvs] mports: magus.pl: Magus 2.1: Many old bugs fixed and replaced with new
ctriv at midnightbsd.org
ctriv at midnightbsd.org
Mon Sep 22 14:28:09 EDT 2008
Log Message:
-----------
Magus 2.1: Many old bugs fixed and replaced with new exciting bugs.
Modified Files:
--------------
mports/Tools/magus/slave:
magus.pl (r1.24 -> r1.25)
-------------- next part --------------
Index: magus.pl
===================================================================
RCS file: /home/cvs/mports/Tools/magus/slave/magus.pl,v
retrieving revision 1.24
retrieving revision 1.25
diff -L Tools/magus/slave/magus.pl -L Tools/magus/slave/magus.pl -u -r1.24 -r1.25
--- Tools/magus/slave/magus.pl
+++ Tools/magus/slave/magus.pl
@@ -35,49 +35,19 @@
use strict;
use warnings;
-use lib qw(/usr/mports/Tools/lib);
+use FindBin;
+use lib "$FindBin::Bin/../../lib";
use Magus;
use Sys::Syslog;
-use POSIX qw(setsid);
+use POSIX qw(setsid :sys_wait_h);
use Getopt::Std qw(getopts);
use File::Path qw(rmtree);
-use Fcntl qw(:flock);
-$SIG{INT} = sub { report('info', "$$: caught sigint"); die "Caught SIGINT $$\n" };
+# Load some things before we chroot.
+use YAML::Dumper;
-my @origARGV = @ARGV;
-my $self = "$Magus::Config{SlaveMportsDir}/Tools/magus/slave/magus.pl";
-my $Lock;
-my $WorkerID = 1;
-
-
-while (1) {
- eval {
- main();
- exit(0);
- };
-
- if ($@) {
- # Check ping in case a dropped DB caused some other exception.
- if ($@ =~ m/DBI/ || !Magus::DBI->ping) {
- while (1) {
- report(err => "Could not connect to database ($@) waiting $Magus::Config{'DoneWaitPeriod'} seconds");
- sleep($Magus::Config{'DoneWaitPeriod'});
- last if Magus::DBI->ping;
- }
-
- if ($Lock) {
- $Lock->port->reset;
- $Lock->delete;
- }
- # back up to the main() call we go.
- } else {
- die $@;
- }
- }
-}
=head1 magus.pl
@@ -98,171 +68,194 @@
Verbose mode, print lots of status information to stdout.
-=back
-
-=head2 main()
+=item -j <n>
-Top level function. Sets up logging and starts the main loop. The general
-logic is to keep looping. If we have an active run, we want to work on ports
-from that run, as long as there are ports.
+Test n ports in parallel. Defaults to 1.
=cut
-my %opts;
+our @origARGV = @ARGV;
+our $Self = "$Magus::Config{SlaveMportsDir}/Tools/magus/slave/magus.pl";
-sub main {
- my $lock;
-
- getopts('fvj:', \%opts);
+our %Children;
+our $Children = 0;
+our @DeadChildren;
+our %WorkerIDs;
+
+$SIG{CHLD} = sub {
+ my $pid;
+
+ while (($pid = waitpid(-1, WNOHANG)) > 0) {
+ # we don't care about children that aren't magus.pls (make and what not also go thru this).
+ my $info = delete $Children{$pid} || return;
+ $Children--;
+ $WorkerIDs{$info->{worker_id}} = 1;
+ push(@DeadChildren, {lock => $info->{lock}, pid => $pid});
+ }
+};
+
+$SIG{INT} = sub { warn "Parent caught SIGINT"; kill_children(); exit 0 };
+
+
+=pod debug
+
+sub X {
+ use Data::Dumper;
- daemonize() unless $opts{f};
+ print Dumper({PID => $$, ChildrenCnt => $Children, Children => \%Children, Dead => \@DeadChildren, IDs => \%WorkerIDs});
+}
+
+=cut
+
+sub X { }
+
+our (%opts, $run);
+getopts('fvj:', \%opts);
- if ($opts{j}) {
- while ($opts{j} > 1) {
- my $pid = fork;
-
- if ($pid) {
- report(debug => "Forked child: $pid");
- $opts{j}--;
- next;
- } elsif (defined $pid) {
- $WorkerID++;
- last;
- } else {
- die "Unable to fork child: $!\n";
+$opts{j} ||= 1;
+%WorkerIDs = map { $_ => 1 } 1 .. $opts{j};
+
+report('info', "Starting magus on %s (%s)", $Magus::Machine->name, $Magus::Machine->arch);
+
+clean_up_reports_dir();
+
+daemonize() unless $opts{f};
+
+while (1) {
+ eval { main() };
+
+ if ($@) {
+ my $error = $@;
+
+ # Check ping in case a dropped DB caused some other exception.
+ if (($@ =~ m/lost\s+connection/i || m/can't\s+connect/i || m/server\s+shutdown/i || m/gone\s+away/i) || !Magus::DBI->ping) {
+ while (1) {
+ report(err => "Could not connect to database ($@) waiting $Magus::Config{'LostDBWaitPeriod'} seconds");
+ sleep($Magus::Config{'LostDBWaitPeriod'});
+ if (Magus::DBI->ping) {
+ last;
+ }
}
+
+ # back up to the main() call we go.
+ } else {
+ die $error;
}
- }
-
- report('info', "Starting magus on %s (%s)", $Magus::Machine->name, $Magus::Machine->arch);
-
- while (1) {
- my $run = get_current_run($Magus::Machine);
-
- if (!$run || !($lock = Magus::Lock->get_ready_lock($run))) {
- # there's no more ports to test, sleep for a while and check again.
- report('debug', "No ports to build, sleeping $Magus::Config{DoneWaitPeriod} seconds.");
- sleep($Magus::Config{DoneWaitPeriod});
- next;
- }
-
- # stick the current lock in the global, so we can flush it if need be.
- $Lock = $lock;
-
- report('info', "Starting test run for: %s", $lock->port);
-
- run_test($lock);
-
- report('info', 'Run completed for: %s', $lock->port);
-
- $lock->delete;
- undef $Lock;
}
}
-=head3 Exiting
-
-Note that all the locks associated with this machine will be deleted at
-script exit.
+sub main {
+ my $parentPID = $$;
+ # This isn't right yet. XXX
+
+ MAIN: while (1) {
+ if (@DeadChildren) {
+ process_dead_children();
+ }
-=cut
+ while ($Children < $opts{j}) {
+ X();
+ my $lock;
+ my $run = get_current_run();
+ if (!$run || !($lock = Magus::Lock->get_ready_lock($run))) {
+ # there's no more ports to test, sleep for a while and check again.
+ X();
+ report(debug => "No ports to build, sleeping $Magus::Config{DoneWaitPeriod} seconds.");
+ sleep($Magus::Config{DoneWaitPeriod});
+ next MAIN;
+ }
-END {
- if ($Lock) {
- $Lock->delete;
+
+ eval { start_child($lock); };
+ if ($@) {
+ report(debug => "Unhandled child exception: $@\n");
+ exit(0) if $$ != $parentPID;
+ }
+ }
+
+ sleep;
}
}
-=head2 run_test($lock)
-Run the entire test process on the given port (well, the given lock, but the
-port is at C<< $lock->port >>).
+=head2 start_child($lock)
+
+Start the child off.
=cut
-sub run_test {
+sub start_child {
my ($lock) = @_;
- #
- # we have a few eval blocks here, because we want to be sure that the
- # exception will not allow the child to go to the main program logic.
- #
- my ($port, $chroot);
-
- $lock->port->note_event(info => "Test started.");
-
- eval {
- $port = $lock->port;
- $chroot = Magus::Chroot->new(
- workerid => $WorkerID,
- tarball => $Magus::Config{ChrootTarBall},
- );
-
- copy_dep_pkgfiles($lock, $chroot);
- };
- if (my $error = $@) {
- handle_exception($error, $lock);
- return;
- }
+ my $worker_id = (keys %WorkerIDs)[0] || die "No worker IDs\n";
+ delete $WorkerIDs{$worker_id};
+
+ # XXX Block signals
+ my $pid = fork;
- # we fork so just the child chroots, then we can get out of the chroot.
- my $pid = fork();
if ($pid) {
- # Parent, we wait for the child to finish, and if we get sigint
- # while we are waiting, we stop the child and then cleanup
- local $SIG{INT} = sub {
- waitpid($pid, 0);
- handle_exception("Caught SIGINT", $lock);
- exit(1);
- };
-
- waitpid($pid, 0);
- } elsif (defined $pid) {
- # child here; chroot and test the port
- eval {
- $chroot->do_chroot();
- chdir($port->origin);
-
- my $test = Magus::PortTest->new(port => $port, chroot => $chroot);
- report('info', "Building $port");
- my $results = $test->run;
+ report(info => "Forked child worker $pid");
- insert_results($port, $results);
- };
+ # parent return
+ $Children{$pid} = { lock => $lock, worker_id => $worker_id };
+ $Children++;
+
+ X();
+ return;
+ } elsif (defined($pid)) {
+ $SIG{INT} = sub { die "CAUGHT SIGINT\n"; };
+ $SIG{CHLD} = 'DEFAULT';
+
+ eval { init_results_file(); };
- if ($@ && $@ !~ m/DBI/) {
- handle_exception($@, $lock);
+ if ($@) {
+ # not good.
+ report(alert => "Unable to init reporting system for child! $@");
+ exit(72);
}
+
+ eval { run_test($lock, $worker_id); };
+
+ handle_exception($@, $lock) if $@;
exit(0);
} else {
- die "Could not fork: $!\n";
- }
+ die "Couldn't fork: $!\n";
+ }
+}
- # Back to the parent here
- if ($? == 0) {
- # update our port object with new data from the database, as the child
- # process probably changed stuff
- $port->refresh;
-
- eval {
- if ($port->status eq 'pass' || $port->status eq 'warn') {
- upload_pkgfile($port, $chroot);
- }
- };
+=head2 run_test($lock)
+
+Run the entire test process on the given port (well, the given lock, but the
+port is at C<< $lock->port >>).
+
+=cut
+
+sub run_test {
+ my ($lock, $worker_id) = @_;
+
+ my $port = $lock->port;
+ $port->note_event(info => "Test started.");
+
+ my $chroot = Magus::Chroot->new(
+ workerid => $worker_id,
+ tarball => $Magus::Config{ChrootTarBall},
+ );
+
+ copy_dep_pkgfiles($lock, $chroot);
+
+ $chroot->do_chroot();
+ chdir($port->origin);
- if ($@) {
- handle_exception($@, $lock);
- }
- } else {
- die "Child exited unexpectantly: $?\n";
- }
+ my $test = Magus::PortTest->new(port => $port, chroot => $chroot);
+ report('info', "Building $port");
+ my $results = $test->run;
- $port->note_event($port->status => "Test complete.");
+ store_results($results);
}
@@ -346,7 +339,6 @@
=cut
-
sub insert_results {
my ($port, $results) = @_;
@@ -412,7 +404,7 @@
my ($format, @args) = @msg;
my $time = localtime;
- printf "[$time] ($$:$WorkerID): $format\n", @args;
+ printf "[$time] ($$): $format\n", @args;
}
}
}
@@ -429,19 +421,102 @@
die $error if $error =~ m/DBI/;
- if ($error =~ m/SIGINT/) {
- $lock->port->reset;
+ return if $error =~ m/SIGINT/;
+
+ report('err', "Exception thrown building %s: %s", $lock->port, $error);
+
+ store_results({ exception => $error });
+}
+
+
+
+=head2 init_results_file()
+
+Prepares the results file for writing. Run this before you chroot.
+
+=cut
+
+{
+ my $fh; # this var is private to init_results_file() and store_results()
+ sub init_results_file {
+ my $file = "$Magus::Config{SlaveResultsDir}/$$";
+
+ if (! -d $Magus::Config{SlaveResultsDir} && !mkdir($Magus::Config{SlaveResultsDir})) {
+ die "Couldn't mkdir $Magus::Config{SlaveResultsDir}: $!";
+ }
+
+ open($fh, '>', $file) || die "Couldn't open $file: $!";
- report('debug', 'Exiting 0 from SIGINT (prior result for %s deleted).', $lock->port);
- exit 0;
+ return;
}
-
- report('err', "Exception throw building %s: %s", $lock->port, $error);
+
+
+=head3 store_results($ref)
+
+Dump the hashref into a file that the parent can find.
+
+=cut
+
+ sub store_results {
+ my ($ref) = @_;
+
+ my $yaml = YAML::Dump($ref);
+
- $lock->port->set_result_internal(internal => ExceptionThrown => "Internal exception thrown: $error");
+ print $fh $yaml;
+ close($fh);
+ undef $fh;
+ }
}
+=head3 process_dead_children(@remains)
+
+Takes a list of hash containing information about dead child
+processes and takes action on that information.
+
+=cut
+
+sub process_dead_children {
+ while (@DeadChildren) {
+ my $corpse = shift @DeadChildren;
+
+ my $port = $corpse->{lock}->port;
+ my $file = "$Magus::Config{SlaveResultsDir}/$corpse->{pid}";
+
+ my $results;
+
+ # if there's no file, then there are no results to store, the child
+ # probably exited from SIGINT, or maybe a dropped DB.
+ if (-e $file) {
+ eval {
+ $results = YAML::LoadFile($file)
+ };
+
+ # unlink($file) || report(err => "Couldn't unlink $file: $!);
+
+ if ($@) {
+ report(err => "Unable to load YAML results file (pid: $corpse->{pid}): $@");
+ $port->note_event(internal => "Unable to load YAML results file: $@");
+ $port->status('internal');
+ $port->update;
+ } elsif ($results->{exception}) {
+ report(err => "Exception from pid $corpse->{pid}: $results->{exception}");
+ $port->note_event(internal => $results->{exception});
+ $port->status('internal');
+ $port->update;
+ } else {
+ insert_results($port, $results);
+ }
+ }
+
+ $corpse->{lock}->delete;
+ }
+}
+
+
+
+
=head3 get_current_run()
Check to see if the current run is the latest, and update it if it isn't.
@@ -452,27 +527,16 @@
my $current = Magus::Run->latest($Magus::Machine) || return;
my $tree_id = get_tree_id("$Magus::Config{'SlaveDataDir'}/mports") || 0;
- # we need up update the mports tree to match the new run, but we need
- # to make sure that no other worker is using it, and that we have an
- # exclusive lock to change it.
+ #report(debug => "latest run: $current, tree id: $tree_id, machine id: " . $Magus::Machine->run);
+
if ($current != $Magus::Machine->run || $tree_id != $current) {
- return if Magus::Lock->search(machine => $Magus::Machine);
-
- my $lockfile = "$Magus::Config{SlaveDataDir}/mports.lock";
-
- # we need to make sure that no other process is updating the mports dir.
- open(my $lock, '>>', $lockfile) || die "Couldn't open $lockfile: $!\n";
- unless (flock($lock, LOCK_EX|LOCK_NB)) {
- report(debug => "Unable to lock mports directory (perhaps another process is updating it?)");
- # this will block until the file is unlocked.
- flock($lock, LOCK_EX) || die "Couldn't lock $lockfile: $!\n";
-
- report(debug => "Reloading self.");
- exec($self, @origARGV);
+ local $SIG{CHLD} = 'DEFAULT';
+ if ($Children) {
+ report(debug => "Children active, not moving to new run yet.");
return;
}
-
+
$Magus::Machine->run($current);
$Magus::Machine->update;
@@ -486,7 +550,7 @@
chdir($dir) || die "Couldn't chdir to $dir: $!\n";
if (system("/usr/bin/fetch -p $tarball") != 0) {
- die "Couldn't fetch $tarball";
+ die "Couldn't fetch $tarball: (exit $?)";
}
report(debug => "Deleting old $dir/mports");
@@ -498,7 +562,7 @@
unlink($current->tarball);
report(debug => "Reloading self.");
- exec($self, @origARGV);
+ exec($Self, @origARGV);
}
return $current;
@@ -516,5 +580,46 @@
return $id;
}
+
+=head2 kill_children()
+
+Kill all the children we have.
+
+=cut
+
+sub kill_children {
+ if ($Children) {
+ local $SIG{CHLD} = 'DEFAULT';
+
+ warn "Killing children @{[keys %Children ]}\n";
+ X();
+ kill INT => keys %Children;
+
+ # make sure that we wait until all the kids are dead.
+ my $kid;
+ do {
+ $kid = waitpid(-1, WNOHANG);
+ } until $kid > 0;
+ }
+
+ foreach my $lock (Magus::Lock->search(machine => $Magus::Machine)) {
+ $lock->port->reset;
+ $lock->delete;
+ }
+}
+
+
+=head2 clean_up_reports_dir()
+
+Nuke everything in the reports dir.
+
+=cut
+
+sub clean_up_reports_dir {
+ local $SIG{CHLD} = 'DEFAULT';
+ system("rm $Magus::Config{SlaveResultsDir}/*");
+}
+
+
1;
__END__
More information about the Midnightbsd-cvs
mailing list