# --- ROCK-COPYRIGHT-NOTE-BEGIN --- # # This copyright note is auto-generated by ./scripts/Create-CopyPatch. # Please add additional copyright information _after_ the line containing # the ROCK-COPYRIGHT-NOTE-END tag. Otherwise it might get removed by # the ./scripts/Create-CopyPatch script. Do not edit this copyright text! # # ROCK Linux: rock-src/package/fake/xmltv/tv-grab-de-tvtoday-cvs.patch # ROCK Linux is Copyright (C) 1998 - 2004 Clifford Wolf # # This patch file is dual-licensed. It is available under the license the # patched project is licensed under, as long as it is an OpenSource license # as defined at http://www.opensource.org/ (e.g. BSD, X11) or under the terms # of the GNU General Public License as published by the Free Software # Foundation; either version 2 of the License, or (at your option) any later # version. # # --- ROCK-COPYRIGHT-NOTE-END --- --- ./grab/de_tvtoday/tv_grab_de_tvtoday.in.orig 2004-09-15 15:41:16.423279656 +0200 +++ ./grab/de_tvtoday/tv_grab_de_tvtoday.in 2004-09-15 15:47:54.423774384 +0200 @@ -17,7 +17,7 @@ [--days N] [--offset N] [--quiet] [--slow] [--nosqueezeout] -tv_grab_de_tvtoday --list-channels +tv_grab_de_tvtoday --list-channels [--icons] =head1 DESCRIPTION @@ -62,6 +62,10 @@ B<--list-channels> write output giving elements for every channel available (ignoring the config file), but no programmes. +B<--icons> get the URL for channel-logos together with the channel-list. +Mind that this takes a long time, since a webpage has to be requested for +every channel. + B<--help> print a help message and exit. =head1 SEE ALSO @@ -83,7 +87,7 @@ use warnings; use strict; use Date::Manip; -use XMLTV::Version '$Id: tv_grab_de_tvtoday.in,v 1.13 2004/05/09 17:49:11 epaepa Exp $ '; +use XMLTV::Version '$Id: tv_grab_de_tvtoday.in,v 1.19 2004/07/17 14:45:34 stesie Exp $ '; use Getopt::Long; use HTML::TreeBuilder; use HTML::Entities; @@ -101,7 +105,7 @@ To grab data: $0 [--config-file FILE] [--output FILE] [--days N] [--offset N] [--quiet] [--slow] [--nosqueezeout] -Channel List: $0 --list-channels +Channel List: $0 --list-channels [--icons] END ; @@ -123,6 +127,7 @@ sub squeeze_out_desc($$); sub refine_category_attr($$); sub get_channels(); +sub get_icons(); sub channel_id($); sub split_up_names($$); sub parse_date_data($); @@ -133,7 +138,7 @@ sub refine_credits($); #-- Category-Matching RegExp -our constant $category_regexp = '^(.*?\s+)?((?:[\wäöüßÄÖÜ-]+-?)?(?:[Aa]genten|[Cc]harts|[Dd]oku(?:mentar|mentation)?|Episoden|[Dd]rama|[Kk]rimi|[Kk]omödie|[Ll]iteratur|[Mm]agazin|[Mm]elodram|[Pp]orträt|[Rr]eportage|[Rr]eihe|[Ss]oap|[Ss]atire|[Ss]erie|[Ss]tudie|[Tt]alk|[Tt]hriller)-?(?:[Ff]ilm|[Mm]ovie|[Ss]how)?s?)([\s;,]+.*)?$'; +our constant $category_regexp = '^(.*?\s+)?((?:[\wäöüßÄÖÜ-]+-?)?(?:[Aa]genten|[Cc]harts|[Cc]omedy|[Dd]oku(?:mentar|mentation)?|Episoden|[Dd]rama|[Kk]rimi|[Kk]omödie|[Ll]iteratur|[Mm]agazin|[Mm]elodram|[Pp]orträt|[Rr]eportage|[Rr]eihe|[Ss]oap|[Ss]atire|[Ss]erie|[Ss]tudie|[Tt]alk|[Tt]hriller|Wunschclip)-?(?:[Ff]ilm|[Mm]ovie|[Ss]how)?s?)([\s;,]+.*)?$'; #-- DEBUG FLUFF ... my $debug = 0; @@ -166,6 +171,7 @@ my $opt_slow = 0; my $opt_nosqueeze = 0; my $opt_list_channels; +my $opt_icons = 0; my $opt_help; my $opt_share; @@ -179,6 +185,7 @@ 'slow' => \$opt_slow, 'nosqueezeout' => \$opt_nosqueeze, 'list-channels' => \$opt_list_channels, + 'icons' => \$opt_icons, 'help' => \$opt_help, 'share=s' => \$opt_share, ) or usage(0); @@ -222,6 +229,11 @@ #-- hey, we can't live without channel data, so let's get that now! my %channels = get_channels(); +#-- if wanted, get the channel logos (only in list-channels-mode done here!) +my %icons; +%icons = get_icons() if $opt_icons && $opt_list_channels; + + # share/ directory for storing channel mapping files. This next line # is altered by processing through tv_grab_de_tvtoday.PL. But we can # use the current directory instead of share/tv_grab_de_tvtoday for @@ -331,8 +343,11 @@ if ($mode eq 'list-channels') { foreach (keys %channels) { - $writer->write_channel({'id'=>channel_id($_), - 'display-name'=>[[$channels{$_}, $lang]]}); + my %channel = ('id' => channel_id($_), + 'display-name' => [[$channels{$_}, $lang]]); + $channel{'icon'} = [{'src' => "http://www.tvtoday.de" . $icons{$_}}] + if(defined($icons{$_})); + $writer->write_channel(\%channel); } $writer->end(); @@ -346,14 +361,10 @@ die "No channels specified, run me with --configure flag\n" unless(scalar(@requests)); +#-- We need to wait with writing the channels, therefore buffer the program-infos +my @writebuffer; -#-- write out tags -foreach(@requests) { - $writer->write_channel({'id'=>channel_id($_), - 'display-name'=>[[$channels{$_}, $lang]]}); -} - -#-- write out tags +#-- get tags my $numdays = $opt_days + $opt_offset - 1; my $bar = new Term::ProgressBar('grabbing', scalar(@requests) * $opt_days) if Have_bar && not $opt_quiet; @@ -365,6 +376,18 @@ } } +#-- write out tags +foreach(@requests) { + my $id = channel_id($_); + my %channel = ('id' => $id, + 'display-name' => [[$channels{$_}, $lang]]); + $channel{'icon'} = [{'src' => "http://www.tvtoday.de" . $icons{$id}}] + if(defined($icons{$id})); + $writer->write_channel(\%channel); +} + +#-- write out tags +$writer->write_programme($_) foreach(@writebuffer); #-- hey, looks like we've finished ... $writer->end(); @@ -466,7 +489,9 @@ @el = $el[0]->content_list(); - $_ = shift @el; #-- in this column there's only the logo of the tv station, ignore that + $_ = shift @el; #-- in this column there's the logo of the tv station + $icons{$grab->{'channel'}} = $_->look_down('_tag' => 'img')->attr('src') + unless(exists($icons{$grab->{'channel'}})); $_ = shift @el; #-- there we should have the time when our show begins ... die "unable to extract time-information from html code, content:\n", $_->as_text() @@ -490,6 +515,10 @@ if (ref($span) eq "") { $span =~ s/\s*\([^\(]+\)\s*$//; + if ($span =~ s/\s*(\d+)\.\sTeil//gi) { + #- strip episode number from title field + $show{q(episode-num)} = [ [ $1, "onscreen" ] ]; + } $show{title} = [[ $span, $lang ]]; } elsif (ref($span) eq "HTML::Element" and $span->tag eq "a") { @@ -500,6 +529,10 @@ my $title = ($tag->content_list())[0]; $title =~ s/\s*\([^\(]+\)\s*$//; + if ($title =~ s/\s*(\d+)\.\sTeil//gi) { + #- strip episode number from title field + $show{q(episode-num)} = [ [ $1, "onscreen" ] ]; + } $show{title} = [[ $title, $lang ]]; } else { die } @@ -539,7 +572,36 @@ #-- okay, commit that data now ... $show{channel} = $grab->{channel}; - $writer->write_programme(\%show); + + #-- try to construct clumps, if necessary ... + if(defined($show{q(desc)}) + && $show{q(desc)}->[0][0] =~ m/^anschl\.\s+(.*)/) { + my $clumpname = $1; + + delete $show{q(desc)}; + $show{q(clumpidx)} = '0/2'; # first of two shows ... + push @writebuffer, \%show; + + my %newshow; + foreach(qw(start stop channel)) { $newshow{$_} = $show{$_}; } + $newshow{q(clumpidx)} = '1/2'; # second show ... + + #- $clumpname may contain a extra VPS start time ... + if($clumpname =~ s/\s+\(VPS ([012]?[0-9])\.([0-6][0-9])\)//) { + $newshow{q(vps-start)} = $newshow{q(start)}; + substr($newshow{"vps-start"}, 8, 4) = sprintf("%02d%02d", $1, $2); + } + + warn("title of clumped show contains problematic chars, please take care") + if($clumpname =~ m/[,;:\*]/); + + $newshow{q(title)} = [[ $clumpname, $lang ]]; + push @writebuffer, \%newshow; + } + else { + #-- common clumpless show, write out ... + push @writebuffer, \%show; + } last if($grab->{"lasttime"} >= 86400 && !$grab->{"lastday"}); } @@ -698,9 +760,10 @@ my $show = shift; my @newdesc; - #push(@newdesc, $show->{"desc"}->[0][0]) if($show->{"desc"}); - if(my @parts = ($$desc =~ m/^\s*(\(([^\)]*)\))?\s+([^,;]+)(,\s+([^,;]+)\s+([12][09][0-9]{2}(?:[\/-][0-9]{2})?))?\s*(?:; (Buch\/Regie|R): ([^;]+))?(; D: (.+))?\s*$/)) { + # try to match , ; R: ; D: construct + # where / or the [RD]: stuff may be missing ... + if(my @parts = ($$desc =~ m/^\s*(\(([^\)]*)\))?\s+([^,;]+)(,\s+([^,;]+)\s+([12][09][0-9]{2}(?:[\/-][0-9]{2})?))?\s*; (?:(?:; )?(Buch\/Regie|R): ([^;]+))?\s*((?:; )?D: (.+))?\s*$/)) { $$desc = ""; #-- $parts[1] is the show title in English (doesn't have to be available) @@ -806,14 +869,6 @@ next; } - if (s/\(VPS ([0-2][0-9])\.([0-5][0-9])\)//) { - # assume that vps begins on the same day as the actual show, - # thus simply overwrite the stored 'start' information - # might do trouble when daylight savings time begins/ends - $show->{"vps-start"} = $show->{"start"}; - substr($show->{"vps-start"}, 8, 6) = "$1$200"; - } - if (my ($type, $names) = m/^\s*(Reporter:|Moderation:|Kommentar:|Gast:|Gäste:|Mit|Film von)\s+(?!de[nm]\s+)(.*?)\s*$/) { $names =~ s/\s*u.a.\s*$//; $names =~ s/\([^\(\)]+\)//g; #-- remove all brackets, that further describe the person @@ -1002,10 +1057,45 @@ +#-- get channel logos +sub get_icons() { + my %icons; + my $url="http://www.tvtoday.de/tv/programm/programm.php?ztag=0&sparte=alle&uhrzeit=Ax00&sender="; + my $chan; + my $tag; + my $addr; + + my $bar = new Term::ProgressBar('grabbing icons', scalar(keys(%channels))) + if Have_bar && not $opt_quiet; + + foreach (keys %channels) { + my $tb = new HTML::TreeBuilder(); + $tb->parse(get_page($url.$_)); + $tag = $tb->look_down('_tag' => 'img', + sub { + return ($_[0]->attr('src') =~ m/^\/tv\/programm\/bilder\/senderlogos\//); + }); + + update $bar if Have_bar && not $opt_quiet; + + unless(ref($tag) eq "HTML::Element") { + $tb->delete; + next; + }; + + $icons{$_} = $tag->attr('src'); + $tb->delete; + } + + return %icons; +} + + + #-- get channel listing sub get_channels() { my %channels; - my $url="http://www.tvtoday.de/tv/programm/programm.php?ztag=0&sparte=alle&uhrzeit=Ax00&sender=ZDF"; + my $url="http://www.tvtoday.de/tv/programm/programm.php?ztag=0&sparte=alle&uhrzeit=Ax00&sender=alle"; my $tb=new HTML::TreeBuilder(); $tb->parse(get_page($url));