package crawl_books;
use strict;
use utf8;
use Encode;
use Encode::Guess qw/ euc-jp shiftjis 7bit-jis utf8 /;
$Encode::Guess::NoUTFAutoGuess = 1; # utf16 utf32 を候補から外す
use LWP::UserAgent;
use JSON;
use XML::Simple;
use Data::Dumper;
sub new{
my $invocant = shift;
my $class = ref($invocant) || $invocant;
my $args = shift; $args = {} if ! $args;
my ( $obj ) = bless {
cache_dir=>'bolivia/cache/',
cache_live=>60,
cache_max=>2000,
'calil'=>{
'url'=>'https://calil.jp/book/',
'ua'=>{
'mo'=>'Mozilla/5.0 (Linux; Android 7.0; PLUS Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.98 Mobile Safari/537.36',
'pc'=>'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0',
},
},
'booklog'=>{
'url'=>'https://booklog.jp/item/1/',
'ua'=>{
'mo'=>'Mozilla/5.0 (Linux; Android 7.0; PLUS Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.98 Mobile Safari/537.36',
'pc'=>'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0',
},
},
%$args,
@_
}, $class;
return $obj;
}
sub DESTROY {
my $self = shift;
return;
}
sub get_booksinfo{
my $self = shift;
my $args = shift;
$args->{word} =~ s! !!g;
my $book;
if($args->{word} =~ m!^[0-9X]+$!){
$book = $self->crawl_calil_book({isbn=>$args->{word}});
}
else{
$book = $self->crawl_booklog_book({isbn=>$args->{word}});
}
return $book;
}
#
sub isbn10{
my $self = shift;
my $args = shift;
return if ! $args->{isbn};
# B.+ for used books number
return $args->{isbn} if $args->{isbn} =~ m!^B[A-Z0-9]+! && length($args->{isbn}) == 10;
my $isbn = $args->{isbn};
$isbn =~ s![ -]!!g; $isbn =~ tr/0-9/0-9/;
my $dig;
my $type;
if( $isbn =~ /^978/ && length($isbn) == 13 ){
$type = 13;
$dig = chop($isbn);
my @buf = split('', $isbn);
my $odd; my $eve;
for ( my $i = 1; $i <= length($isbn); $i++ ){
if( $i%2 ){
$odd += $buf[$i-1]; # 奇数
}
else{
$eve += $buf[$i-1]; # 偶数
}
}
$eve = $eve * 3;
my $sum = $eve + $odd;
my $check = chop( $sum );
$check = 10-$check if( $check );
return 0 if $check ne $dig;
$isbn =~ s/^978//;
}
else{
$type = 10;
$dig = chop($isbn);
}
return 0 if( length($isbn) != 9 );
my @buf = split('', $isbn);
my $w = 10;
my $sum = 0;
foreach ( @buf ){ $sum += $_ * $w; --$w; }
my $check = ( 11-($sum % 11) );
$check = $check == 10 ? 'X' : $check == 11 ? '0' : $check;
if( $type == 10 ){
if( $check ne $dig ){ return 0; }else{ return $isbn . $dig; }
}
elsif( $type == 13 ){
return $isbn . $check;
}
}
#
# crawl & cache
#
sub crawl_calil_book{
my $self = shift;
my $args = shift;
return if ! $args->{isbn};
my $res = $self->crawl_request({asin=>$args->{isbn}, url=>$self->{calil}->{url}, ua=>$self->{calil}->{ua}->{pc}});
return if ! $res->is_success;
my $content = $res->content;
$content = Encode::decode('utf8' , $content) if ! utf8::is_utf8($content);
$content =~ s!\r?\n!!g;
my ($keywords) = $content =~ m!<meta name="keywords"[ ]+content="([^\"]+)" */>!;
my ($image) = $content =~ m!<meta property="og:image"[ ]+content="([^\"]+)" */>!;
return if ! $keywords;
my ($title) = $keywords =~ m!^([^\,]+),!;
$title = $self->clean_str($title,'title');
$keywords =~ s!.+$args->{isbn},?!!;
my @creators = split(/ *, */, $keywords);
my $reviews; # TODO
$image = '' if $image !~ m!https?://.+amazon\.com.+!;
my $size;
$size->{s} = $image ? $self->{Amazon}->{simage}->{width} : '';
$size->{m} = $image ? $self->{Amazon}->{mimage}->{width} : '';
$size->{l} = $image ? $self->{Amazon}->{limage}->{width} : '';
return [{
'asin'=>$args->{isbn},'title'=>$title,'creator'=>\@creators,
'simage'=>$image, 'simage_w' =>$size->{s},
'mimage'=>$image, 'mimage_w'=>$size->{m},
'limage'=>$image, 'limage_w'=>$size->{l},
'reviews'=>$reviews,
'crawl'=>'calil',
}];
}
sub crawl_booklog_book{
my $self = shift;
my $args = shift;
return if ! $args->{isbn};
my $res = $self->crawl_request({asin=>$args->{isbn}, url=>$self->{booklog}->{url}, ua=>$self->{booklog}->{ua}->{pc}});
return if ! $res->is_success;
my $content = $res->content;
$content = Encode::decode('utf8' , $content) if ! utf8::is_utf8($content);
$content =~ s!\r?\n!!g;
my ( $head ) = $content =~ m!<head[^>]*>(.+)</head>!;
my @buf = $head =~ m!(<script type="application/ld\+json">.+</script>)!g;
my $json_str = join('', @buf);
$json_str =~ s!<script type="application/ld\+json">!!g;
$json_str =~ s!</script>$!!;
my $info;
foreach my $json (split('</script>', $json_str)){
my $ref;
eval{ $ref = decode_json(Encode::encode('utf8',$json)) };
if( $@ ){
printf qq{%s<br />%s\n}, $json, $@;
next;
}
if( $ref->{'@type'} eq 'Book' ){
$info->{title} = $ref->{name};
$info->{image} = $ref->{thumbnailUrl};
if( ref($ref->{author}) eq 'ARRAY' ){
foreach ( @{ $ref->{author} } ){
if( $_->{'@type'} eq 'Person'){
push( @{ $info->{author} }, $_->{name});
}
}
}
last;
}
}
my $title = $self->clean_str($info->{title},'title');
my @words = @{ $info->{author} };
my $reviews; # TODO
my $image = $info->{image};
my $size;
$size->{s} = $image ? $self->{Amazon}->{simage}->{width} : '';
$size->{m} = $image ? $self->{Amazon}->{mimage}->{width} : '';
$size->{l} = $image ? $self->{Amazon}->{limage}->{width} : '';
return [{
'asin'=>$args->{isbn},'title'=>$title,'creator'=>\@words,
'simage'=>$image, 'simage_w' =>$size->{s},
'mimage'=>$image, 'mimage_w'=>$size->{m},
'limage'=>$image, 'limage_w'=>$size->{l},
'reviews'=>$reviews,
'crawl'=>'booklog',
}];
}
sub crawl_request{
my $self = shift;
my $args = shift;
my $url = $args->{url} . $args->{asin};
my $agent = ($args->{ua} || $self->{crawl}->{ua}->{mo}); # デフォルトはmo
my $ua = LWP::UserAgent->new(agent=>$agent);
my $req = HTTP::Request->new(GET=>$url);
my $res = $ua->request($req);
return $res;
}
# 2019/3/17
sub clear_cache_dir{
my $self = shift;
my $args = shift;
no strict 'refs';
my $fh = 'FH000';
++$fh while fileno($fh);
opendir($fh, $self->{root} . $self->{cache_dir});
my @buf = grep(!/^\./, readdir($fh));
closedir($fh);
return if @buf < $self->{cache_max};
my %files = map{$_=>(stat($self->{root} . $self->{cache_dir} . $_))[9]} @buf;
my @sorted = sort({$files{$a}<=>$files{$b}} @buf);
my $timestamp = time;
foreach ( @sorted ){
last if( ($timestamp - $files{$_}) < 60*60*24* $self->{cache_live} );
unlink($self->{root} . $self->{cache_dir} . $_);
}
return;
}
sub check_cache{
my $self = shift;
my $args = shift;
$args->{isbn} =~ s![ \-]!!g;
return if ! $args->{isbn};
$self->clear_cache_dir();
if(-f $self->{root} . $self->{cache_dir} . $args->{isbn}){
no strict 'refs';
my $fh = 'FH000';
++$fh while fileno($fh);
open($fh, $self->{root} . $self->{cache_dir} . $args->{isbn}) || die;
my $cache = <$fh>; # 冒頭の1行だけ
close($fh);
return if ! $cache; # 念のため
$cache =~ s!\r?\n!!;
$cache = Encode::decode('utf8', $cache);
my @w = split(/\t/, $cache);
my @creators = split('###', $w[1]);
my $image = $w[2];
$w[3] = sprintf qq{https://www.amazon.co.jp/exec/obidos/ASIN/%s/ongoldenpond-22/ref=nosim},$args->{isbn} if ! $w[3];
my $reviews; # TODO
# TODO 2019/03/21
$w[0] =~ s!&;!&!g;
$w[0] =~ s! *\([^\(]*(講談社|単行本|文庫|選書|新書|ノベル|ミステリ|シリーズ|アンソロジ|コミック|ワイド|モーニング|ムック|MOOK|comic|BOOK|Novel)[^\)]*\) *!!i;
$w[0] =~ s!\(白水Uブックス.+\)!!;
if($w[0] !~ m!\(! && $w[0] =~ m!\)$!){
$w[0] =~ s!\)$!!;
}
return [{
'title'=>$w[0], 'creator'=>\@creators, 'page'=>$w[3],
'simage'=>$image, 'simage_w' => $self->{Amazon}->{simage}->{width},
'mimage'=>$image, 'mimage_w'=>$self->{Amazon}->{mimage}->{width},
'limage'=>$image, 'limage_w'=>$self->{Amazon}->{limage}->{width},
'reviews'=>$reviews,
'asin'=>$args->{isbn},
}];
}
else{
my ($book, $max) = $self->get_booksinfo({'op'=>'asin','word'=>$args->{isbn}});
if( ! $book->[0]{title}){ # can't get bookinfo
unlink ($self->{root} . $self->{cache_dir} . $args->{isbn} );
return;
}
$book->[0]{limage} = '' if $book->[0]{limage} !~ m!https?://.+amazon\.com.+!;
my $cache = sprintf qq{%s\t%s\t%s\t%s}, $book->[0]{title}, join('###',@{$book->[0]{creator}}),$book->[0]{limage} , $book->[0]{page};
no strict 'refs';
my $fh = 'FH000';
++$fh while fileno($fh);
open($fh, '>' . $self->{root} . $self->{cache_dir} . $args->{isbn}) || die;
print $fh $cache;
close($fh);
return ($book,$max);
}
return;
}
#
# misc
#
sub clean_str{
my $self = shift;
my $str = shift;
my $flag = shift;
return if ! $str;
if( ref($str) ){
foreach (@{$str}){
$_ = Encode::decode('utf8', $_ ) if ! utf8::is_utf8($_);
s/\'/’/g; s/\,/,/g; s/\"/”/g; s/\;/;/g; s/\`/‘/g; s/\././g; s/\(/(/g; s/\)/)/g; s/^ +//g; s/ +$//g; s/\t//g;
}
}
else{
$str = Encode::decode('utf8', $str ) if ! utf8::is_utf8($str);
$str =~ s/\'/’/g; $str =~ s/\,/,/g; $str =~ s/\"/”/g; $str =~ s/\;/;/g; $str =~ s/\`/‘/g; $str =~ s/\././g; $str =~ s/^ +//g; $str =~ s/ +$//g; $str =~s/\t//g;
}
if( $flag eq 'title'){
my $dismiss = join('|', qw{
COMICS comics Comics Comix COMIX コミック コミックス
文庫 新書 選書 単行本 ノベルス ノベルズ NOVELS Novels novels NOVELS
モーニングKC イブニングKC アフタヌーンKC KCデラックス ワイドKC
ミステリーYA books BOOKS Books ブックス ミステリーランド 講談社BOX
扶桑社ミステリー ムック Mook mook MOOK シリーズ ENTERTAINMENT 新☆ハヤカワ・SF・シリーズ
} );
$str =~ s/\"//g;
$str =~ s/^ +//g; $str =~ s/ +$//g; $str =~ s/ +/ /g; $str =~ s/\t//g;
$str =~ s/\)\)$/\)/;
$str =~ s/ +\([^\)]*($dismiss)[^\)]*\) *$//;
$str =~ s/ +\([^\)]*($dismiss)[^\)]*\) *$//; # ダブリ...
$str =~ s/\(/(/g; $str =~ s/\)/)/g;
$str =~ s/\&[;;]/&/g;
}
return $str;
}
1;