|
|
@ -0,0 +1,282 @@ |
|
|
|
--- ./grab/de_tvtoday/tv_grab_de_tvtoday.in.orig 2004-09-15 15:41:16.423279656 +0200
|
|
|
|
+++ ./grab/de_tvtoday/tv_grab_de_tvtoday.in 2004-09-15 15:47:54.423774384 +0200
|
|
|
|
@@ -17,7 +17,7 @@
|
|
|
|
[--days N] [--offset N] |
|
|
|
[--quiet] [--slow] [--nosqueezeout] |
|
|
|
|
|
|
|
-tv_grab_de_tvtoday --list-channels
|
|
|
|
+tv_grab_de_tvtoday --list-channels [--icons]
|
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
|
|
|
|
|
|
@@ -62,6 +62,10 @@
|
|
|
|
B<--list-channels> write output giving <channel> elements for every |
|
|
|
channel available (ignoring the config file), but no programmes. |
|
|
|
|
|
|
|
+B<--icons> get the URL for channel-logos together with the channel-list.
|
|
|
|
+Mind that this takes a long time, since a webpage has to be requested for
|
|
|
|
+every channel.
|
|
|
|
+
|
|
|
|
B<--help> print a help message and exit. |
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
|
|
@@ -83,7 +87,7 @@
|
|
|
|
use warnings; |
|
|
|
use strict; |
|
|
|
use Date::Manip; |
|
|
|
-use XMLTV::Version '$Id: tv_grab_de_tvtoday.in,v 1.13 2004/05/09 17:49:11 epaepa Exp $ ';
|
|
|
|
+use XMLTV::Version '$Id: tv_grab_de_tvtoday.in,v 1.19 2004/07/17 14:45:34 stesie Exp $ ';
|
|
|
|
use Getopt::Long; |
|
|
|
use HTML::TreeBuilder; |
|
|
|
use HTML::Entities; |
|
|
|
@@ -101,7 +105,7 @@
|
|
|
|
To grab data: $0 [--config-file FILE] [--output FILE] |
|
|
|
[--days N] [--offset N] |
|
|
|
[--quiet] [--slow] [--nosqueezeout] |
|
|
|
-Channel List: $0 --list-channels
|
|
|
|
+Channel List: $0 --list-channels [--icons]
|
|
|
|
END |
|
|
|
; |
|
|
|
|
|
|
|
@@ -123,6 +127,7 @@
|
|
|
|
sub squeeze_out_desc($$); |
|
|
|
sub refine_category_attr($$); |
|
|
|
sub get_channels(); |
|
|
|
+sub get_icons();
|
|
|
|
sub channel_id($); |
|
|
|
sub split_up_names($$); |
|
|
|
sub parse_date_data($); |
|
|
|
@@ -133,7 +138,7 @@
|
|
|
|
sub refine_credits($); |
|
|
|
|
|
|
|
#-- Category-Matching RegExp |
|
|
|
-our constant $category_regexp = '^(.*?\s+)?((?:[\wäöüßÄÖÜ-]+-?)?(?:[Aa]genten|[Cc]harts|[Dd]oku(?:mentar|mentation)?|Episoden|[Dd]rama|[Kk]rimi|[Kk]omödie|[Ll]iteratur|[Mm]agazin|[Mm]elodram|[Pp]orträt|[Rr]eportage|[Rr]eihe|[Ss]oap|[Ss]atire|[Ss]erie|[Ss]tudie|[Tt]alk|[Tt]hriller)-?(?:[Ff]ilm|[Mm]ovie|[Ss]how)?s?)([\s;,]+.*)?$';
|
|
|
|
+our constant $category_regexp = '^(.*?\s+)?((?:[\wäöüßÄÖÜ-]+-?)?(?:[Aa]genten|[Cc]harts|[Cc]omedy|[Dd]oku(?:mentar|mentation)?|Episoden|[Dd]rama|[Kk]rimi|[Kk]omödie|[Ll]iteratur|[Mm]agazin|[Mm]elodram|[Pp]orträt|[Rr]eportage|[Rr]eihe|[Ss]oap|[Ss]atire|[Ss]erie|[Ss]tudie|[Tt]alk|[Tt]hriller|Wunschclip)-?(?:[Ff]ilm|[Mm]ovie|[Ss]how)?s?)([\s;,]+.*)?$';
|
|
|
|
|
|
|
|
#-- DEBUG FLUFF ... |
|
|
|
my $debug = 0; |
|
|
|
@@ -166,6 +171,7 @@
|
|
|
|
my $opt_slow = 0; |
|
|
|
my $opt_nosqueeze = 0; |
|
|
|
my $opt_list_channels; |
|
|
|
+my $opt_icons = 0;
|
|
|
|
my $opt_help; |
|
|
|
my $opt_share; |
|
|
|
|
|
|
|
@@ -179,6 +185,7 @@
|
|
|
|
'slow' => \$opt_slow, |
|
|
|
'nosqueezeout' => \$opt_nosqueeze, |
|
|
|
'list-channels' => \$opt_list_channels, |
|
|
|
+ 'icons' => \$opt_icons,
|
|
|
|
'help' => \$opt_help, |
|
|
|
'share=s' => \$opt_share, |
|
|
|
) or usage(0); |
|
|
|
@@ -222,6 +229,11 @@
|
|
|
|
#-- hey, we can't live without channel data, so let's get that now! |
|
|
|
my %channels = get_channels(); |
|
|
|
|
|
|
|
+#-- if wanted, get the channel logos (only in list-channels-mode done here!)
|
|
|
|
+my %icons;
|
|
|
|
+%icons = get_icons() if $opt_icons && $opt_list_channels;
|
|
|
|
+
|
|
|
|
+
|
|
|
|
# share/ directory for storing channel mapping files. This next line |
|
|
|
# is altered by processing through tv_grab_de_tvtoday.PL. But we can |
|
|
|
# use the current directory instead of share/tv_grab_de_tvtoday for |
|
|
|
@@ -331,8 +343,11 @@
|
|
|
|
|
|
|
|
if ($mode eq 'list-channels') { |
|
|
|
foreach (keys %channels) { |
|
|
|
- $writer->write_channel({'id'=>channel_id($_),
|
|
|
|
- 'display-name'=>[[$channels{$_}, $lang]]});
|
|
|
|
+ my %channel = ('id' => channel_id($_),
|
|
|
|
+ 'display-name' => [[$channels{$_}, $lang]]);
|
|
|
|
+ $channel{'icon'} = [{'src' => "http://www.tvtoday.de" . $icons{$_}}]
|
|
|
|
+ if(defined($icons{$_}));
|
|
|
|
+ $writer->write_channel(\%channel);
|
|
|
|
} |
|
|
|
|
|
|
|
$writer->end(); |
|
|
|
@@ -346,14 +361,10 @@
|
|
|
|
die "No channels specified, run me with --configure flag\n" unless(scalar(@requests)); |
|
|
|
|
|
|
|
|
|
|
|
+#-- We need to wait with writing the channels, therefore buffer the program-infos
|
|
|
|
+my @writebuffer;
|
|
|
|
|
|
|
|
-#-- write out <channel> tags
|
|
|
|
-foreach(@requests) {
|
|
|
|
- $writer->write_channel({'id'=>channel_id($_),
|
|
|
|
- 'display-name'=>[[$channels{$_}, $lang]]});
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-#-- write out <programme> tags
|
|
|
|
+#-- get <programme> tags
|
|
|
|
my $numdays = $opt_days + $opt_offset - 1; |
|
|
|
my $bar = new Term::ProgressBar('grabbing', scalar(@requests) * $opt_days) |
|
|
|
if Have_bar && not $opt_quiet; |
|
|
|
@@ -365,6 +376,18 @@
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
+#-- write out <channel> tags
|
|
|
|
+foreach(@requests) {
|
|
|
|
+ my $id = channel_id($_);
|
|
|
|
+ my %channel = ('id' => $id,
|
|
|
|
+ 'display-name' => [[$channels{$_}, $lang]]);
|
|
|
|
+ $channel{'icon'} = [{'src' => "http://www.tvtoday.de" . $icons{$id}}]
|
|
|
|
+ if(defined($icons{$id}));
|
|
|
|
+ $writer->write_channel(\%channel);
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+#-- write out <program> tags
|
|
|
|
+$writer->write_programme($_) foreach(@writebuffer);
|
|
|
|
|
|
|
|
#-- hey, looks like we've finished ... |
|
|
|
$writer->end(); |
|
|
|
@@ -466,7 +489,9 @@
|
|
|
|
|
|
|
|
@el = $el[0]->content_list(); |
|
|
|
|
|
|
|
- $_ = shift @el; #-- in this column there's only the logo of the tv station, ignore that
|
|
|
|
+ $_ = shift @el; #-- in this column there's the logo of the tv station
|
|
|
|
+ $icons{$grab->{'channel'}} = $_->look_down('_tag' => 'img')->attr('src')
|
|
|
|
+ unless(exists($icons{$grab->{'channel'}}));
|
|
|
|
|
|
|
|
$_ = shift @el; #-- there we should have the time when our show begins ... |
|
|
|
die "unable to extract time-information from html code, content:\n", $_->as_text() |
|
|
|
@@ -490,6 +515,10 @@
|
|
|
|
|
|
|
|
if (ref($span) eq "") { |
|
|
|
$span =~ s/\s*\([^\(]+\)\s*$//; |
|
|
|
+ if ($span =~ s/\s*(\d+)\.\sTeil//gi) {
|
|
|
|
+ #- strip episode number from title field
|
|
|
|
+ $show{q(episode-num)} = [ [ $1, "onscreen" ] ];
|
|
|
|
+ }
|
|
|
|
$show{title} = [[ $span, $lang ]]; |
|
|
|
} |
|
|
|
elsif (ref($span) eq "HTML::Element" and $span->tag eq "a") { |
|
|
|
@@ -500,6 +529,10 @@
|
|
|
|
|
|
|
|
my $title = ($tag->content_list())[0]; |
|
|
|
$title =~ s/\s*\([^\(]+\)\s*$//; |
|
|
|
+ if ($title =~ s/\s*(\d+)\.\sTeil//gi) {
|
|
|
|
+ #- strip episode number from title field
|
|
|
|
+ $show{q(episode-num)} = [ [ $1, "onscreen" ] ];
|
|
|
|
+ }
|
|
|
|
$show{title} = [[ $title, $lang ]]; |
|
|
|
} |
|
|
|
else { die } |
|
|
|
@@ -539,7 +572,36 @@
|
|
|
|
|
|
|
|
#-- okay, commit that data now ... |
|
|
|
$show{channel} = $grab->{channel}; |
|
|
|
- $writer->write_programme(\%show);
|
|
|
|
+
|
|
|
|
+ #-- try to construct clumps, if necessary ...
|
|
|
|
+ if(defined($show{q(desc)})
|
|
|
|
+ && $show{q(desc)}->[0][0] =~ m/^anschl\.\s+(.*)/) {
|
|
|
|
+ my $clumpname = $1;
|
|
|
|
+
|
|
|
|
+ delete $show{q(desc)};
|
|
|
|
+ $show{q(clumpidx)} = '0/2'; # first of two shows ...
|
|
|
|
+ push @writebuffer, \%show;
|
|
|
|
+
|
|
|
|
+ my %newshow;
|
|
|
|
+ foreach(qw(start stop channel)) { $newshow{$_} = $show{$_}; }
|
|
|
|
+ $newshow{q(clumpidx)} = '1/2'; # second show ...
|
|
|
|
+
|
|
|
|
+ #- $clumpname may contain a extra VPS start time ...
|
|
|
|
+ if($clumpname =~ s/\s+\(VPS ([012]?[0-9])\.([0-6][0-9])\)//) {
|
|
|
|
+ $newshow{q(vps-start)} = $newshow{q(start)};
|
|
|
|
+ substr($newshow{"vps-start"}, 8, 4) = sprintf("%02d%02d", $1, $2);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ warn("title of clumped show contains problematic chars, please take care")
|
|
|
|
+ if($clumpname =~ m/[,;:\*]/);
|
|
|
|
+
|
|
|
|
+ $newshow{q(title)} = [[ $clumpname, $lang ]];
|
|
|
|
+ push @writebuffer, \%newshow;
|
|
|
|
+ }
|
|
|
|
+ else {
|
|
|
|
+ #-- common clumpless show, write out ...
|
|
|
|
+ push @writebuffer, \%show;
|
|
|
|
+ }
|
|
|
|
|
|
|
|
last if($grab->{"lasttime"} >= 86400 && !$grab->{"lastday"}); |
|
|
|
} |
|
|
|
@@ -698,9 +760,10 @@
|
|
|
|
my $show = shift; |
|
|
|
my @newdesc; |
|
|
|
|
|
|
|
- #push(@newdesc, $show->{"desc"}->[0][0]) if($show->{"desc"});
|
|
|
|
|
|
|
|
- if(my @parts = ($$desc =~ m/^\s*(\(([^\)]*)\))?\s+([^,;]+)(,\s+([^,;]+)\s+([12][09][0-9]{2}(?:[\/-][0-9]{2})?))?\s*(?:; (Buch\/Regie|R): ([^;]+))?(; D: (.+))?\s*$/)) {
|
|
|
|
+ # try to match <category>, <country> <year>; R: <names>; D: <names> construct
|
|
|
|
+ # where <country>/<year> or the [RD]: stuff may be missing ...
|
|
|
|
+ if(my @parts = ($$desc =~ m/^\s*(\(([^\)]*)\))?\s+([^,;]+)(,\s+([^,;]+)\s+([12][09][0-9]{2}(?:[\/-][0-9]{2})?))?\s*; (?:(?:; )?(Buch\/Regie|R): ([^;]+))?\s*((?:; )?D: (.+))?\s*$/)) {
|
|
|
|
$$desc = ""; |
|
|
|
|
|
|
|
#-- $parts[1] is the show title in English (doesn't have to be available) |
|
|
|
@@ -806,14 +869,6 @@
|
|
|
|
next; |
|
|
|
} |
|
|
|
|
|
|
|
- if (s/\(VPS ([0-2][0-9])\.([0-5][0-9])\)//) {
|
|
|
|
- # assume that vps begins on the same day as the actual show,
|
|
|
|
- # thus simply overwrite the stored 'start' information
|
|
|
|
- # might do trouble when daylight savings time begins/ends
|
|
|
|
- $show->{"vps-start"} = $show->{"start"};
|
|
|
|
- substr($show->{"vps-start"}, 8, 6) = "$1$200";
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
if (my ($type, $names) = m/^\s*(Reporter:|Moderation:|Kommentar:|Gast:|Gäste:|Mit|Film von)\s+(?!de[nm]\s+)(.*?)\s*$/) { |
|
|
|
$names =~ s/\s*u.a.\s*$//; |
|
|
|
$names =~ s/\([^\(\)]+\)//g; #-- remove all brackets, that further describe the person |
|
|
|
@@ -1002,10 +1057,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+#-- get channel logos
|
|
|
|
+sub get_icons() {
|
|
|
|
+ my %icons;
|
|
|
|
+ my $url="http://www.tvtoday.de/tv/programm/programm.php?ztag=0&sparte=alle&uhrzeit=Ax00&sender=";
|
|
|
|
+ my $chan;
|
|
|
|
+ my $tag;
|
|
|
|
+ my $addr;
|
|
|
|
+
|
|
|
|
+ my $bar = new Term::ProgressBar('grabbing icons', scalar(keys(%channels)))
|
|
|
|
+ if Have_bar && not $opt_quiet;
|
|
|
|
+
|
|
|
|
+ foreach (keys %channels) {
|
|
|
|
+ my $tb = new HTML::TreeBuilder();
|
|
|
|
+ $tb->parse(get_page($url.$_));
|
|
|
|
+ $tag = $tb->look_down('_tag' => 'img',
|
|
|
|
+ sub {
|
|
|
|
+ return ($_[0]->attr('src') =~ m/^\/tv\/programm\/bilder\/senderlogos\//);
|
|
|
|
+ });
|
|
|
|
+
|
|
|
|
+ update $bar if Have_bar && not $opt_quiet;
|
|
|
|
+
|
|
|
|
+ unless(ref($tag) eq "HTML::Element") {
|
|
|
|
+ $tb->delete;
|
|
|
|
+ next;
|
|
|
|
+ };
|
|
|
|
+
|
|
|
|
+ $icons{$_} = $tag->attr('src');
|
|
|
|
+ $tb->delete;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ return %icons;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
#-- get channel listing |
|
|
|
sub get_channels() { |
|
|
|
my %channels; |
|
|
|
- my $url="http://www.tvtoday.de/tv/programm/programm.php?ztag=0&sparte=alle&uhrzeit=Ax00&sender=ZDF";
|
|
|
|
+ my $url="http://www.tvtoday.de/tv/programm/programm.php?ztag=0&sparte=alle&uhrzeit=Ax00&sender=alle";
|
|
|
|
|
|
|
|
my $tb=new HTML::TreeBuilder(); |
|
|
|
$tb->parse(get_page($url)); |