#!/usr/bin/perl
use strict;
use warnings;
#
# This program takes a file called ./allocine.html and parses it
# to generate three tab-separated text files:
# - ./allocine-theater.txt
# - ./allocine-movie.txt
# - ./allocine-perf.txt
# Those files can then be used almost as-is to inject in an SQL database.
#
# $Id: parse.perl,v 1.29 2011-06-20 21:37:42 audran Exp $
#
# Copyright (c) Audran Le Baron 2003-2005
#

##########################
# Program initialization #
##########################

# Input page. The header above names ./allocine.html as the only input.
open (my $file_in, '<', 'allocine.html')
    or die "Cannot open allocine.html for reading: $!\n";

# Movie theaters (id, name, address, district, metro, multiplexity)
open (my $theater_out, '>', 'allocine-theater.txt')
    or die "Cannot open allocine-theater.txt for writing: $!\n";

# Movies (id, title)
open (my $movie_out, '>', 'allocine-movie.txt')
    or die "Cannot open allocine-movie.txt for writing: $!\n";

# Performances (theater, movie, day, hour, version, ads)
open (my $perf_out, '>', 'allocine-perf.txt')
    or die "Cannot open allocine-perf.txt for writing: $!\n";

# Parser states. The numeric order matters: the main loop compares states
# with "<" to detect the first week day seen for a theater.
my %state = (
    "Start"        => 0,
    "Theater"      => 1,
    "Address"      => 2,
    "Metro"        => 3,
    "Week_day"     => 4,
    "Movie"        => 5,
    "Perf_version" => 6,
    "Perf_details" => 7,
);
my $current_state = $state {"Start"};

my $theater;            # Movie theater name
my $theater_id;         # Movie theater allocine id
my $multiplexity;       # 2 = UGC, Gaumont and the like
                        # 1 = MK2
                        # 0 = Others
my $address;            # Movie theater address
my $district;           # Movie theater district (1 to 20)
my $metro;              # Movie theater metro station
my $title;              # Movie title
my $movie_id;           # Movie allocine id
my $week_day;           # A week day's id (Wed = 0, ..., Tue = 6)
my %hours;              # Table of perf hours => ads durations
my $version;            # VO or VF
my $DEBUG = 0;          # Display debug messages (No = 0, Yes = 1)

#
# Markup patterns
#
# NOTE(review): in the copy of this script that was reviewed, every "<...>"
# span had been stripped out of the code, so the markup part of the patterns
# below is a reconstruction based on the capture groups the subroutines use.
# Confirm each of them against the archived revision of this script and
# against a real allocine.html before trusting the output.
#

# Theater link. Captures: (1) id prefix letter, "C" meaning Paris according
# to parse_theater; (2) numeric part of the id; (3) theater name.
# NOTE(review): the "csalle=" URL shape is an assumption -- TODO confirm.
my $THEATER_RE = qr{<a [^>]*csalle=([A-Z])([0-9]+)\.html[^>]*>([^<]+)</a>};

# Movie link. Captures: (1) movie id; (2) movie title.
# NOTE(review): the "cfilm=" URL shape is an assumption -- TODO confirm.
my $MOVIE_RE = qr{<a [^>]*cfilm=([0-9]+)\.html[^>]*>([^<]+)</a>};

# Week day tab. Must capture the week day id in group 1.
# FIXME: the original pattern could not be recovered at all. This placeholder
# never matches, which means no state past "Metro" is ever reached and the
# three output files stay empty. Restore the pattern, then drop the warning.
my $WEEK_DAY_RE = qr{(?!)};
warn "parse.perl: week-day pattern is a placeholder; "
   . "no rows will be written until it is restored\n";

# List of performances of one movie; group 1 is the content between the
# first opening <li ...> and the closing </li></ul>.
# NOTE(review): the original may also have matched a leading <ul ...> tag.
my $PERF_LIST_RE = qr{<li[^>]*>(.*?)</li></ul>};

# Separator between two performances inside the list.
my $PERF_SEPARATOR_RE = qr{</li><li[^>]*>};

#######################
# Program subroutines #
#######################

####
# parse_theater ($line)
#  -> tries to parse $line for a new movie theater
#  -> on a Paris theater, resets all theater fields and moves to "Theater"
#  -> on any other theater, moves back to "Start" so that it is skipped
#  -> returns 1 upon success, 0 otherwise
####
sub parse_theater {
    my ($line) = @_;

    if ($line !~ $THEATER_RE) {
        return 0;
    }
    my ($prefix, $number, $name) = ($1, $2, $3);

    if ($prefix ne "C") {
        # Theater not in Paris => skip
        $current_state = $state {"Start"};
        return 1;
    }

    # Theater in Paris
    $theater      = $name;
    $theater_id   = "C$number";
    $multiplexity = 0;
    $address      = "";
    $district     = "0";
    $metro        = "";
    if ($theater =~ /^(Gaumont|UGC|Bienvenue Montparnasse|Bretagne)/) {
        $multiplexity = 2;
    }
    elsif ($theater =~ /^MK2/) {
        $multiplexity = 1;
    }
    $current_state = $state {"Theater"};
    if ($DEBUG) {
        print "[$theater_id: $theater] -> Multiplexity = $multiplexity\n";
    }
    return 1;
}

####
# parse_address ($line)
#  -> tries to parse $line for a movie theater address
#  -> the district is the last two digits of the 750xx / 751xx postal code
#  -> returns 1 upon success, 0 otherwise
####
sub parse_address {
    my ($line) = @_;

    if ($line =~ / +(.+? 75[01]([0-2][0-9]) Paris)/) {
        $address  = $1;
        $district = $2;
        if ($DEBUG) {
            print " $address [$district]\n";
        }
        $current_state = $state {"Address"};
        return 1;
    }
    return 0;
}

####
# parse_metro ($line)
#  -> tries to parse $line for a movie theater metro station
#  -> keeps only the station part: drops the "Métro" label, anything after
#     the first dot or closing tag, bus information and "Ligne ..." prefixes
#  -> returns 1 upon success, 0 otherwise
#
# The accented literal is matched as raw bytes (no "use utf8", no I/O
# layer), so this script and allocine.html must share the same encoding.
####
sub parse_metro {
    my ($line) = @_;

    if ($line !~ / +Métro /) {
        return 0;
    }

    $line =~ s/.*Métro( et RER)? //;
    $line =~ s/\r//;
    $line =~ s/\..*//;
    $line =~ s/<\/.*//;
    $line =~ s/, bus .*//;
    $line =~ s/Ligne [^,]+, //;
    chomp ($line);
    $metro = $line;
    if ($DEBUG) {
        print " Métro: $metro\n";
    }
    $current_state = $state {"Metro"};
    return 1;
}

####
# parse_week_day ($line)
#  -> tries to parse $line for a new week day tab
#  -> returns 1 upon success, 0 otherwise
####
sub parse_week_day {
    my ($line) = @_;

    if ($line =~ $WEEK_DAY_RE) {
        $week_day = $1;
        if ($DEBUG) {
            print " Day $week_day\n";
        }
        $current_state = $state {"Week_day"};
        return 1;
    }
    return 0;
}

####
# parse_movie ($line)
#  -> tries to parse $line for a new movie
#  -> forgets the version and the performances of the previous movie
#  -> returns 1 upon success, 0 otherwise
####
sub parse_movie {
    my ($line) = @_;

    if ($line =~ $MOVIE_RE) {
        $movie_id = $1;
        $title    = $2;
        $version  = "";
        %hours    = ();         # table of (hour:min => ads)
        if ($DEBUG) {
            print " [$movie_id] $title\n";
        }
        $current_state = $state {"Movie"};
        return 1;
    }
    return 0;
}

####
# parse_perf_version ($line)
#  -> tries to parse $line for a new movie performance version (VO|VF)
#  -> returns 1 upon success, 0 otherwise
####
sub parse_perf_version {
    my ($line) = @_;

    if ($line =~ / En (VF|VO)<\/span>/) {
        # With the pattern as written the group always takes part in a
        # match, so the "VO" fallback is never used; it is kept from the
        # original code.
        $version = defined ($1) ? $1 : "VO";
        if ($DEBUG) {
            print " Version : $version\n";
        }
        $current_state = $state {"Perf_version"};
        return 1;
    }
    return 0;
}

####
# parse_perf_details ($line)
#  -> tries to parse $line for a new movie performance information (hours, ads)
#  -> fills %hours with "hh:mm" => ads duration in minutes, or "\N" when
#     no second time is available for the performance
#  -> returns 1 upon success, 0 otherwise
####
sub parse_perf_details {
    my ($line) = @_;

    if ($line !~ $PERF_LIST_RE) {
        return 0;
    }
    my $list  = $1;
    my @perfs = split $PERF_SEPARATOR_RE, $list;

    %hours = ();
    foreach my $perf (@perfs) {
        # NOTE(review): the original per-performance pattern was lost.
        # This reconstruction assumes that the first time in the visible
        # text is the performance time and that the second one, if any,
        # is the time the movie really starts -- TODO confirm.
        (my $text = $perf) =~ s/<[^>]*>//g;
        my ($hour, $hour_2) = $text =~ /\b([0-2]?[0-9]:[0-5][0-9])\b/g;
        next unless defined ($hour);

        my ($time_hour, $time_minute) = split /:/, $hour;

        # 24:mm => 00:mm ; h:mm => 0h:mm
        $time_hour = 0 if $time_hour == 24;
        $hour = sprintf ("%02d:%02d", $time_hour, $time_minute);

        # "\N" is written literally in the output when the duration of
        # the ads is unknown.
        my $ads = "\\N";
        if (defined ($hour_2)
            && $hour_2 =~ /([0-2]?[0-9]):([0-5][0-9])/) {
            # Minutes between both times; the modulo keeps the result
            # correct when the hour wraps around midnight.
            $ads = (($1 - $time_hour) % 24) * 60 + ($2 - $time_minute);
        }
        $hours {$hour} = $ads;
        if ($DEBUG) {
            print " [$version] $hour (ads: $ads)\n";
        }
    }
    $current_state = $state {"Perf_details"};
    return 1;
}

#####################
# Program main loop #
#####################

# For each state, the parsers to try on the current line, in order.
# The first one that succeeds wins and sets the new state itself.
my %parsers_for = (
    # State: Start => Search for a theater
    $state {"Start"}        => [ \&parse_theater ],
    # State: Theater => Search for an address, or else a metro,
    #                   or else a week day, or else a new theater
    $state {"Theater"}      => [ \&parse_address, \&parse_metro,
                                 \&parse_week_day, \&parse_theater ],
    # State: Address => Search for a metro, or else a week day,
    #                   or else a new theater
    $state {"Address"}      => [ \&parse_metro, \&parse_week_day,
                                 \&parse_theater ],
    # State: Metro => Search for a week day, or else a new theater
    $state {"Metro"}        => [ \&parse_week_day, \&parse_theater ],
    # State: Week_day => Search for a movie, or else a new week day,
    #                    or else a new theater
    $state {"Week_day"}     => [ \&parse_movie, \&parse_week_day,
                                 \&parse_theater ],
    # State: Movie => Search for a perf_version, or else a perf_details,
    #                 or else a new movie, or else a new week day,
    #                 or else a new theater
    $state {"Movie"}        => [ \&parse_perf_version, \&parse_perf_details,
                                 \&parse_movie, \&parse_week_day,
                                 \&parse_theater ],
    # State: Perf_version => Search for a perf_details, or else a new movie,
    #                        or else a new week day, or else a new theater
    $state {"Perf_version"} => [ \&parse_perf_details, \&parse_movie,
                                 \&parse_week_day, \&parse_theater ],
    # State: Perf_details => Search for more perf_details, or else a new
    #                        movie, or else a new week day,
    #                        or else a new theater
    $state {"Perf_details"} => [ \&parse_perf_details, \&parse_movie,
                                 \&parse_week_day, \&parse_theater ],
);

#
# Reads lines one by one
#
while (my $line = <$file_in>) {
    my $match      = 0;
    my $last_state = $current_state;

    foreach my $parser (@{ $parsers_for {$current_state} }) {
        if ($parser->($line)) {
            $match = 1;
            last;
        }
    }
    next unless $match;

    if ($DEBUG && 0) {
        print " ### MATCH! ### Last state: $last_state ### Current state: $current_state ###\n";
    }

    # Write output files

    # A theater is written once, when its first week day shows up: by
    # then its address and metro, if any, have been collected.
    if ($last_state < $state {"Week_day"}
        && $current_state == $state {"Week_day"}) {
        if ($DEBUG) {
            print " => THEATER: $theater_id\t$theater\t$address\t$district\t$metro\t$multiplexity\n";
        }
        print {$theater_out} "$theater_id\t$theater\t$address\t$district\t$metro\t$multiplexity\n";
    }

    if ($current_state == $state {"Movie"}) {
        if ($DEBUG) {
            print " => MOVIE: $movie_id\t$title\n";
        }
        print {$movie_out} "$movie_id\t$title\n";
    }

    if ($current_state == $state {"Perf_details"}) {
        # Sorted so that the output order does not depend on hash order.
        foreach my $hour (sort (keys (%hours))) {
            if ($DEBUG) {
                print " => PERF: $theater_id\t$movie_id\t$week_day\t$hour\t$version\t", $hours{$hour}, "\n";
            }
            print {$perf_out} "$theater_id\t$movie_id\t$week_day\t$hour\t$version\t", $hours{$hour}, "\n";
        }
    }
}

################
# Program exit #
################

close ($file_in);

# Buffered write errors only show up when the handle is closed.
close ($theater_out) or die "Cannot close allocine-theater.txt: $!\n";
close ($movie_out)   or die "Cannot close allocine-movie.txt: $!\n";
close ($perf_out)    or die "Cannot close allocine-perf.txt: $!\n";