|
|
--- ./grab/de_tvtoday/tv_grab_de_tvtoday.in 2006/04/21 17:17:21 1.42
+++ ./grab/de_tvtoday/tv_grab_de_tvtoday.in 2006/05/25 17:10:18 1.45 HEAD
@@ -99,7 +99,7 @@
use warnings; use strict; -use XMLTV::Version '$Id: tv_grab_de_tvtoday.in,v 1.42 2006/04/21 17:17:21 stesie Exp $ ';
+use XMLTV::Version '$Id: tv_grab_de_tvtoday.in,v 1.45 2006/05/25 17:10:18 stesie Exp $ ';
use XMLTV::Capabilities qw/baseline manualconfig cache share/; use XMLTV::Description 'Germany (www.tvtoday.de)'; use Date::Manip; @@ -138,6 +138,7 @@
else { *t = \&Log::TraceMessages::t; *d = \&Log::TraceMessages::d; + #$Log::TraceMessages::On = 1;
} } @@ -502,9 +503,10 @@
} #-- extract date of grabbed data from retrieved webpage ... - $_ = $page->look_down('_tag' => 'td', 'class' => 'navigator-hhead-large');
+ $_ = $page->look_down('_tag' => 'span', 'class' => 'text-weiss');
die("cannot find date on requested page") unless($_->as_text() =~ m/([1-3]?[0-9])\.(1?[0-9])\.(20[0-9]{2})/); + t "extracted date: $3-$2-$1";
$day = ParseDate("$3-$2-$1 00:00:00"); #-- well, now let's scan the table for programme data @@ -573,6 +575,7 @@
$show{q(episode-num)} = [ [ $1, "onscreen" ] ]; } + t "show title: $span";
$show{title} = [[ $span, $lang ]]; } elsif (ref($span) eq "HTML::Element" and $span->tag eq "a") { @@ -583,7 +586,7 @@
my $title = ($tag->content_list())[0]; - $title = convert_cp1252_chars(\$title);
+ convert_cp1252_chars(\$title);
$title =~ s/\s*\([^\(]+\)\s*$//; if ($title =~ s/\s*(\d+)\.\sTeil//gi) { @@ -836,6 +839,7 @@
# try to match <category>, <country> <year>; R: <names>; D: <names> construct # where <country>/<year> or the [RD]: stuff may be missing ... if(my @parts = ($$desc =~ m/^\s*(\(([^\)]*)\))?\s+([^,;0-9]+)(,?\s+([^,;]+)\s+([12][09][0-9]{2}(?:[\/-][0-9]{2})?))?\s*; (?:(?:; )?(Buch\/Regie|R): ([^;]+))?\s*((?:; )?D: (.+))?\s*$/)) { + t "split rule: <category>, <country> <year> ...";
$$desc = ""; #-- $parts[1] is the show title in English (doesn't have to be available) @@ -887,9 +891,14 @@
} } else { + t "split rule: dot splitting";
my @data = split "·", $$desc; s/(^\s|\s$)//g foreach(@data); #CHG# + for(0 .. (scalar(@data) - 1)) {
+ t "dot-split part $_: " . $data[$_];
+ }
+
if(scalar(@data) == 3 && not($data[1] =~ m/[\wäöüßÄÖÜ]+:/) #- FIX false positive: tvtoday.de seems to publish "guests: <names>" here some (rare) times :-( && $data[2] =~ m/^Mit (.*?)$/) { @@ -945,15 +954,22 @@
next; } - if (my ($cat, $rest1, $names, $guests, $rest2) = m/^([^,]+?)((?:\s+-\s+..+?)*) - Moderation: (.+?) - Gäste: (..+?)(?:\s+-\s+(.+))?$/) {
+ if (my ($nocat, $cat, $rest1, $names, $guests, $rest2) = m/^(([^,.%^&*();]+?)((?:\s+-\s+..+?)*)|.+) - Moderation: (.+?) - Gäste: (..+?)(?:\s+-\s+(.+))?$/) {
my @data = split_up_names($names, $show); push @{$show->{"credits"}{"presenter"}}, @data; my @guest_data = split_up_names($guests, $show); push @{$show->{"credits"}{"guest"}}, @guest_data; - $show->{"category"} = [[ $cat, $lang ]];
-
- warn "misdetected category: $cat"
- if($cat =~ m/\d{4}/);
+
+ if(defined($cat)) {
+ $show->{"category"} = [[ $cat, $lang ]];
+
+ warn "misdetected category: $cat"
+ if($cat =~ m/\d{4}/);
+ }
+ else {
+ t "no-cat match: $nocat";
+ $rest1 = $nocat;
+ }
my @rest; foreach(defined($rest1) ? split(m/\s+-\s+/, $rest1) : undef, $rest2) { @@ -963,14 +979,21 @@
next unless length($_); } - if (my ($cat, $rest1, $names, $rest2) = m/^([^,]+?)((?:\s+-\s+..+?)*) - Moderation: (.+?)(?:\s+-\s+(.+))?$/) {
+ if (my ($nocat, $cat, $rest1, $names, $rest2) = m/^(([^,]+?)((?:\s+-\s+..+?)*)|.+) - Moderation: (.+?)(?:\s+-\s+(.+))?$/) {
my @data = split_up_names($names, $show); push @{$show->{"credits"}{"presenter"}}, @data; - $show->{"category"} = [[ $cat, $lang ]];
- warn "misdetected category: $cat"
- if($cat =~ m/\d{4}/);
-
+ if(defined($cat)) {
+ $show->{"category"} = [[ $cat, $lang ]];
+
+ warn "misdetected category: $cat"
+ if($cat =~ m/\d{4}/);
+ }
+ else {
+ t "no-cat match: $nocat";
+ $rest1 = $nocat;
+ }
+
my @rest; foreach(defined($rest1) ? split(m/\s+-\s+/, $rest1) : undef, $rest2) { push @rest, $_ if(defined($_) && length($_));
|