#!/usr/bin/perl
#
# Created by Peter Krumins (peter@catonmat.net, @pkrumins on twitter)
# www.catonmat.net -- good coders code, great coders reuse
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
use warnings;
use strict;
#
# This program was written as a part of "reddit media: intelligent fun online"
# website generator.
# This website can be viewed here: http://redditmedia.com
#
# See http://www.catonmat.net/designing-reddit-media-website for more info.
#
use Template; # for generating html pages from templates
use DBI;
use XML::RSS;
use POSIX;
use HTML::Entities;
use URI;
use URI::Escape;
use File::Basename;
use File::Find;
use File::Copy;
use File::Flock;
use List::Util 'max';
use open OUT => ':utf8';
use ThumbExtractor;
use ThumbMaker;
use ImageFinder;
# Print various debugging information to stderr.
#
use constant DEBUG => 1;
# Path to the lock file which makes sure that only one copy of this script is
# running at any time.
#
use constant LOCK_FILE_PATH => '/mnt/evms/services/apache/wwwroot/redditmedia/locks/page_gen.lock';
# Number of items to display per page. Note that the first page may stretch up to
# 2*ITEMS_PER_PAGE - 1 items (read about the page generating algorithm in the
# generate_pages subroutine).
#
use constant ITEMS_PER_PAGE => 3;
# Number of items which appear in the feed.
#
use constant ITEMS_PER_FEED => 15;
# Path to the web server's root directory which it will serve static pages from.
#
use constant WWW_PATH => '/mnt/evms/services/apache/wwwroot/redditmedia/www';
# Path to the html templates which will be used to make static pages.
#
use constant TEMPLATE_PATH => '/mnt/evms/services/apache/wwwroot/redditmedia/templates';
# Temporary path for outputting compiled pages. After the pages have been generated
# they are moved to WWW_PATH with rename().
#
use constant OUTPUT_TMP_PATH => '/mnt/evms/services/apache/wwwroot/redditmedia/tmp.www';
# Path to the sqlite database which stores entries and some information about the
# last script run.
#
use constant DATABASE_PATH => '/mnt/evms/services/apache/wwwroot/redditmedia/db/media.db';
# Path to the dir where the compiled (cached) entries will be stored.
#
use constant COMPILED_ENTRIES_PATH => '/mnt/evms/services/apache/wwwroot/redditmedia/compiled.entries';
# Path to already generated pages which once generated never change (except navigation).
# Read about the algorithm in the generate_pages subroutine's comments.
#
use constant COMPILED_PAGES_PATH => '/mnt/evms/services/apache/wwwroot/redditmedia/compiled.pages';
# Path to the single entry link directory. That is where the static pages of
# /link/<pictures|videos>/<first char of sanitized title>/<sanitized title>.html
# are stored.
#
use constant LINK_DIR => '/mnt/evms/services/apache/wwwroot/redditmedia/www/link'; # + /pictures or /videos
# To prevent filesystem bottlenecks, the entry cache is stored in a number of subdirs
# of COMPILED_ENTRIES_PATH. Based on the entry ID, the compiled version is stored in
# COMPILED_ENTRIES_PATH/(integer part of(id/ITEMS_PER_CACHE_DIR) * ITEMS_PER_CACHE_DIR).
# For example if ITEMS_PER_CACHE_DIR is 1000, then the entry with id 25 is stored
# in the COMPILED_ENTRIES_PATH/0 dir, the entry with id 1832 in COMPILED_ENTRIES_PATH/1000, etc.
#
# The same happens for cached image thumbnails in IMAGE_CACHE_PATH.
#
use constant ITEMS_PER_CACHE_DIR => 1000;
# Most sites do not provide thumbnails, in this case we retrieve the picture and
# cache it locally in IMAGE_CACHE_PATH.
#
use constant IMAGE_CACHE_PATH => "/mnt/evms/services/apache/wwwroot/redditmedia/www/image.cache";
# The path, relative to the WWW root, under which a cached icon is accessed from
# the web server.
#
use constant IMAGE_RELATIVE_WWW => "/image.cache";
# To find images and generate thumbnails, the ImageFinder and ThumbMaker modules
# are given the path to the netpbm executables. This constant defines that path.
#
use constant NETPBM_PATH => "/home/pkrumins/tmpinstall/netpbm-10.26.44/foobarbaz/bin";
# Refuse to run if another copy of the generator holds the lock.
lock_script();

# If a true value is given as the first command line argument, all the pages
# are regenerated.  Useful if the html templates have changed.
my $regenerate = shift || 0;
clear_cache() if $regenerate;

# RaiseError makes DBI die on database errors; the explicit check below is
# kept as a second line of defence.
my $dbh = DBI->connect("dbi:SQLite:" . DATABASE_PATH, '', '', { RaiseError => 1 });
die $DBI::errstr unless $dbh;
try_create_status_db();

my %new_entries = get_new_entries('main'); # get all new entries
exit 0 unless keys %new_entries;           # exit if no new entries

# Relative output files are written below OUTPUT_TMP_PATH.  ABSOLUTE is
# enabled because generate_pages processes files by absolute path.
my $template = Template->new({
    INCLUDE_PATH => TEMPLATE_PATH,
    OUTPUT_PATH  => OUTPUT_TMP_PATH,
    ABSOLUTE     => 1
}) or die "Error: could not create the template processor: $Template::ERROR\n";

# Get top users and top hosts
my @top_users = get_top_users();
my @top_hosts = get_top_hosts();

#
# First let's generate the cache of the entries.  They are put below the
# COMPILED_ENTRIES_PATH directory and named $id-$type.html, where $id is the
# primary key of the entry in the database and $type is the type of the link
# (picture, pictures, video, videos, etc.).
# They still contain the [% entry.pos %] template variable which should be
# replaced with the correct position in the page (1, 2, 3, ..., etc).
#
# Also, after each cache entry has been generated, create the
# /link/<type>/<first char of title>/<title>.html page to have something
# indexed in google.
#
foreach my $entry (values %new_entries) {
    generate_entry_cache($entry);
    generate_link($entry);
}

generate_pages('main', \%new_entries);
generate_pages('pictures', {get_new_entries('pictures')});
generate_pages('videos', {get_new_entries('videos')});
update_rss_feed();

# Now move the freshly generated index pages into place with rename().
# A failed rename is reported instead of being silently ignored.
#
my @new_indexes = glob(OUTPUT_TMP_PATH . "/*.html");
foreach my $new_index (@new_indexes) {
    my $published = WWW_PATH . '/' . basename($new_index);
    rename $new_index => $published
        or warn "Error: could not rename '$new_index' to '$published': $!\n";
}

update_status_db();
#
# lock_script
#
# Tries to lock LOCK_FILE_PATH without blocking, so that only one copy of
# the script is running at any given moment.  If the lock cannot be taken,
# a message is printed and the script exits with status 1.
#
sub lock_script {
    return if lock(LOCK_FILE_PATH, undef, 'nonblocking');

    print "Script already running. Quitting.\n";
    exit 1;
}
#
# clear_cache
#
# Deletes all *.html files found directly in OUTPUT_TMP_PATH and
# COMPILED_PAGES_PATH, and in every subdirectory of COMPILED_ENTRIES_PATH.
#
sub clear_cache {
    for my $page_dir (OUTPUT_TMP_PATH, COMPILED_PAGES_PATH) {
        unlink glob($page_dir . "/*.html");
    }

    # compiled entries live one level down, in per-id-range subdirectories
    for my $entry_dir (grep { -d $_ } glob(COMPILED_ENTRIES_PATH . "/*")) {
        unlink glob($entry_dir . "/*.html");
    }
}
#
# update_rss_feed
#
# Takes the last ITEMS_PER_FEED entries from the 'reddit' table and saves an
# RSS 2.0 feed of them to WWW_PATH/feed.html.
#
# TODO: separate this out into a template
#
# NOTE(review): the channel and item dates are written as
# "YYYY-MM-DD HH:MM[:SS]" (item dates are passed through from the database
# as-is); RSS 2.0 readers generally expect RFC 822 dates -- confirm.
#
sub update_rss_feed {
    my $feed = XML::RSS->new(version => '2.0');

    $feed->channel(
        title          => "reddit media: intelligent fun online",
        link           => "http://redditmedia.com",
        description    => "intelligent media from reddit.com",
        language       => "en",
        copyright      => "redditmedia.com (c) Peteris Krumins, the content (c) reddit.com",
        webMaster      => 'peter@catonmat.net',
        managingEditor => 'peter@catonmat.net',
        pubDate        => "2007-08-20 20:00",
        lastBuildDate  => strftime("%Y-%m-%d %H:%M:%S", localtime),
        generator      => "redditmedia.com static page generator"
    );

    $feed->image(
        title       => "reddit media: intelligent fun online",
        url         => "http://redditmedia.com/logo.gif",
        link        => "http://redditmedia.com",
        width       => 120,
        height      => 40,
        description => "reddit alien, meow"
    );

    my $entry_for = $dbh->selectall_hashref(
        "SELECT * FROM reddit ORDER BY id DESC LIMIT " . ITEMS_PER_FEED, 'id'
    );

    # the hash has lost the query order, so walk the ids newest first
    my @newest_first = sort { $b <=> $a } keys %$entry_for;
    for my $entry (map { $entry_for->{$_} } @newest_first) {
        $feed->add_item(
            title     => $entry->{title},
            permaLink => $entry->{url},
            comments  => "http://reddit.com/info/$entry->{reddit_id}/comments",
            pubDate   => $entry->{date_added},
            category  => $entry->{type},
            dc        => {
                creator => "reddit.com"
            }
        );
    }

    $feed->save(WWW_PATH . "/feed.html");
}
#
# try_create_status_db
#
# Creates the 'reddit_status' table if it does not exist yet.
#
# NOTE(review): the original CREATE TABLE statement was lost (the here-doc
# was mangled).  The column names below are the ones used by
# update_status_db; the column types are an assumption -- confirm against the
# real schema.
#
sub try_create_status_db {
    my $tables_q = "SELECT name FROM sqlite_master WHERE type='table' AND name='reddit_status'";
    my $res = $dbh->selectall_arrayref($tables_q);

    # nothing to do if the table is already there
    return if defined $res and @$res;

    my $create_db = <<EOL;
CREATE TABLE reddit_status (
    last_id  INTEGER,
    last_run TEXT
)
EOL
    $dbh->do($create_db);
}
#
# update_status_db
#
# Updates the status information about the last run and the last generated id
# in the 'reddit_status' table.  The highest id among the new entries is
# stored as last_id and the current local time as last_run.  The existing
# rows are updated if there are any, otherwise a new row is inserted.
#
# Nothing is written if there are no new entries, so a stored last_id is
# never replaced by an empty value.
#
sub update_status_db {
    my $max_id = max keys %new_entries;
    return unless defined $max_id;

    my $last_run = strftime("%Y-%m-%d %H:%M:%S", localtime);
    my $records  = $dbh->selectall_arrayref("SELECT * FROM reddit_status");

    if (defined $records and @$records) {
        # Update both columns in one statement so they cannot get out of sync
        #
        $dbh->do(
            "UPDATE reddit_status SET last_id = ?, last_run = ?",
            undef, $max_id, $last_run
        );
    }
    else {
        # Insert new status
        #
        $dbh->do(
            "INSERT INTO reddit_status (last_id, last_run) VALUES (?, ?)",
            undef, $max_id, $last_run
        );
    }
}
#
# generate_pages
#
# Given the page type ('main', 'pictures' or 'videos') and a hashref of new
# entries, the function generates the index pages of the given type. The pages
# are written through generate_page and $template, whose output goes to the
# OUTPUT_TMP_PATH directory.
#
# The function works on the generated cache entries (get_compiled_entries) and
# on the list of already published index pages (get_index_pages).
#
# The hashref of new entries is only counted ($total_new_entries), and that
# count is only used by the commented-out debugging output below.
#
sub generate_pages {
my ($page_type, $new_entries) = @_;
# compiled entries are returned newest first
my @compiled_entries = get_compiled_entries($page_type);
my @index_pages = get_index_pages($page_type);
#
# I want to regenerate pages as little as possible, to keep things running quick.
# Here is the algorithm which splits the entries to pages and makes them never change.
# Only the first page changes at any time.
#
# Let T be the total number of entries and IPP be the items per page to display.
# The first page will have a maximum of 2 * IPP - 1 entries. Given T entries, it is
# first filled with IPP entries and then with T%IPP more. IPP is now a divisor of the
# remaining number of entries T - (IPP + T%IPP).
#
# Now we just have to update the main page and offset the other pages.
#
my ($total_entries, $total_pages) = (scalar @compiled_entries, scalar @index_pages);
my $extra_entries = $total_entries % ITEMS_PER_PAGE; # number of extra entries on first page
my $first_page_entries = ITEMS_PER_PAGE + $extra_entries;
# with fewer than IPP entries in total, the first page simply holds all of them
if ($first_page_entries > $total_entries) {
$first_page_entries = $total_entries;
}
# Generate first page (index.html or index-pictures.html, etc).
# 'pos' counts down from $total_entries, so the newest entry gets the highest position.
my @gen_entries;
for my $entry_idx (0 .. $first_page_entries - 1) {
push @gen_entries, {
file => $compiled_entries[$entry_idx],
pos => $total_entries - $entry_idx
}
}
generate_page($page_type, 1, $total_entries, \@gen_entries);
if ($total_pages <= 1 || $regenerate) {
# no existing index pages for this page_type (or a full regeneration was requested):
# generate all pages!
my $current_page = 2;
my $current_item = 1;
@gen_entries = ();
for my $entry_idx ($first_page_entries .. $#compiled_entries) {
push @gen_entries, {
file => $compiled_entries[$entry_idx],
pos => $total_entries - $entry_idx
};
# a page is written out each time ITEMS_PER_PAGE entries have been collected
if ($current_item % ITEMS_PER_PAGE == 0) {
generate_page($page_type, $current_page, $total_entries, \@gen_entries);
@gen_entries = ();
$current_page++;
}
$current_item++;
}
}
else {
# Generate only the new pages.
# We determine how many new pages will be created and just rename the existing ones
# by that number. This way we avoid regenerating the existing pages.
#
my $pages_required = ($total_entries - $first_page_entries)/ITEMS_PER_PAGE; # pages required to fit the left entries
my $page_offset = ($pages_required + 1) - $total_pages; # +1 because of the first page (plain index.html)
# a zero offset means no page after the first one has to change
if ($page_offset) {
my $total_new_entries = keys %$new_entries;
my $entry_offset = ITEMS_PER_PAGE + $extra_entries;
# print "tot ent: $total_entries\n";
# print "1st p ent: $first_page_entries\n";
# print "total pages: $total_pages\n";
# print "pages req: $pages_required\n";
# print "page offset: $page_offset\n";
# print "tot new en: $total_new_entries\n",
# print "ent offset: $entry_offset\n";
# copy the other pages to new page numbers (moved later back to WWW_PATH)
#
# The compiled copy of each existing page is processed as a template (by its
# absolute path), which fills in the navigation for the new page number and the
# top users/hosts, and is written out under the shifted page name.
#
for my $page_number (2 .. $total_pages) {
my $new_page_number = $page_number + $page_offset;
my $src = COMPILED_PAGES_PATH . '/' . get_page_name($page_type, $page_number);
my $dst = get_page_name($page_type, $new_page_number);
my $data = {
navigation => build_navigation($total_entries, $page_type, $new_page_number),
topusers => \@top_users,
tophosts => \@top_hosts
};
$template->process($src, $data, $dst, binmode => ':utf8');
}
# since generate_page will be creating new index pages and their compiled versions
# we need to change their indexes
# for my $page_number (reverse 2 .. $total_pages) {
# my $src = COMPILED_PAGES_PATH . '/' . get_page_name($page_type, $page_number);
# my $dst = COMPILED_PAGES_PATH . '/' . get_page_name($page_type, $page_number + $page_offset);
# rename $src => $dst;
# }
#
# NOTE(review): the loop below walks $pages_required pages worth of entries starting
# at page 2, i.e. it covers every page after the first one and not only the
# $page_offset new ones, so it appears to also rewrite the pages copied above
# (together with their compiled versions) -- confirm whether this is intended.
#
my $current_page = 2;
my $current_item = 1;
@gen_entries = ();
for my $entry_idx ($entry_offset .. $entry_offset + $pages_required * ITEMS_PER_PAGE - 1) {
push @gen_entries, {
file => $compiled_entries[$entry_idx],
pos => $total_entries - $entry_idx
};
if ($current_item % ITEMS_PER_PAGE == 0) {
generate_page($page_type, $current_page, $total_entries, \@gen_entries);
@gen_entries = ();
$current_page++;
}
$current_item++;
}
}
}
}
#
# generate_page
#
# Given a page type, page number, total number of entries and an arrayref of
# entries ({ file => ..., pos => ... } hashes), the function generates a
# static HTML page from the 'index.html' template and puts it in the
# OUTPUT_TMP_PATH directory.
#
# A second, "compiled" version of the same page is stored in
# COMPILED_PAGES_PATH.  In it the navigation is left as a literal
# '[% navigation %]' directive, so that the page can later be processed again
# under a different page number without regenerating its entries.
#
# Dies if the template cannot be processed or the compiled page cannot be
# written, so that a broken page is never silently produced.
#
sub generate_page {
    my ($page_type, $page_number, $total_entries, $entries) = @_;
    my $outpage = get_page_name($page_type, $page_number);
    my $data = {
        last_update => strftime("%Y-%m-%d %H:%M:%S", localtime),
        navigation  => build_navigation($total_entries, $page_type, $page_number),
        entries     => $entries,
        page_type   => $page_type,
        topusers    => \@top_users,
        tophosts    => \@top_hosts,
    };
    $template->process('index.html', $data, $outpage, binmode => ':utf8')
        or die "Error: could not generate '$outpage': " . $template->error . "\n";

    # create a compiled version which will be used when moving pages
    $data->{navigation}   = '[% navigation %]';
    $data->{topusers_tpl} = 1; # include top user template
    $data->{tophosts_tpl} = 1; # include top host template
    my $output = '';
    $template->process('index.html', $data, \$output)
        or die "Error: could not compile '$outpage': " . $template->error . "\n";

    my $file_path = COMPILED_PAGES_PATH . "/$outpage";
    open my $out, '>', $file_path or die "Error: could not open '$file_path': $!";
    print $out $output;
    # buffered write errors only show up when the handle is closed
    close $out or die "Error: could not close '$file_path': $!";
}
#
# get_page_name
#
# Given a page type and a page number, builds the index page filename:
# 'index', followed by '-<type>' for every type except 'main', followed by
# '-<number>' for every page except the first one, and '.html'.
#
sub get_page_name {
    my ($page_type, $page_number) = @_;

    my @name_parts = ('index');
    push @name_parts, $page_type   unless $page_type eq 'main';
    push @name_parts, $page_number unless $page_number == 1;

    return join('-', @name_parts) . '.html';
}
#
# build_navigation
#
# Given the total number of entries, the page type (main, pictures or videos)
# and the current page number, the function builds the navigation html code
# by processing the 'navigation.html' template.  The template receives one
# { href, page, current } hash per page, for pages
# 1 .. int(total entries / ITEMS_PER_PAGE).
#
# If no current page is given, -1 is passed to the template instead.
#
sub build_navigation {
    my ($total_entries, $type, $current) = @_;
    $current = -1 unless $current;

    my $last_page = int($total_entries / ITEMS_PER_PAGE);

    # get_page_name produces exactly the index page names used here
    my @navarr = map +{
        href    => get_page_name($type, $_),
        page    => $_,
        current => $current
    }, 1 .. $last_page;

    my $output = '';
    $template->process('navigation.html', { navs => \@navarr }, \$output);
    return $output;
}
#
# generate_link
#
# Given a reddit entry (hashref of a row of the 'reddit' table), the function
# generates the single entry page
# LINK_DIR/<pictures|videos>/<first char of sanitized title>/<sanitized title>.html
# from the 'link.html' template.  Missing directories are created on the way.
#
# Dies if the entry type is unknown (see get_link_dir), if the template cannot
# be processed, or if a directory or the page itself cannot be written.
#
sub generate_link {
    my $entry = shift;

    my $sane_title = sanitize_title($entry->{title});
    my $html_title = encode_entities($entry->{title});

    my $entry_data = {
        icon          => get_icon($entry),
        title         => $html_title,
        title_uri_esc => uri_escape($entry->{title}),
        sane_title    => $sane_title,
        host          => get_host($entry->{url}),
        link_dir      => get_link_dir($entry->{type}),
        user          => encode_entities($entry->{user}),
        url           => $entry->{url},
        url_uri_esc   => uri_escape($entry->{url}),
        date_added    => $entry->{date_added},
        reddit_id     => $entry->{reddit_id}
    };
    my %link_data = (
        last_update => strftime("%Y-%m-%d %H:%M:%S", localtime),
        title       => $html_title,
        topusers    => \@top_users,
        tophosts    => \@top_hosts
    );

    my $output = '';
    $template->process('link.html', { entry => $entry_data, %link_data }, \$output)
        or die "Error: could not generate link page for '$sane_title': "
             . $template->error . "\n";

    # build path to link file, creating the directories level by level
    my $link_path = LINK_DIR;
    $link_path .= "/pictures" if $entry->{type} =~ /picture/;
    $link_path .= "/videos" if $entry->{type} =~ /video/;
    unless (-d $link_path) {
        mkdir $link_path or die "Error: could not create '$link_path': $!";
    }

    # NOTE(review): a title without any alphanumeric character sanitizes to an
    # empty string, which ends up as a '.html' file directly in the type
    # directory -- confirm whether such titles can occur.
    $link_path .= '/' . substr($sane_title, 0, 1);
    unless (-d $link_path) {
        mkdir $link_path or die "Error: could not create '$link_path': $!";
    }

    $link_path .= "/$sane_title.html";
    open my $out, '>', $link_path or die "Error: could not open '$link_path': $!";
    print $out $output;
    # buffered write errors only show up when the handle is closed
    close $out or die "Error: could not close '$link_path': $!";
}
#
# generate_entry_cache
#
# The function takes a reddit article entry (hashref of a row of the 'reddit'
# table) and generates an entry cache file from the 'index_entry.html'
# template.  The file is stored as
# COMPILED_ENTRIES_PATH/<cache subdir>/<id>-<type>.html (see get_cache_subdir).
#
# Unless a full regeneration was requested, an entry whose cache file already
# exists is skipped.  The existence check looks at the same path the file is
# written to, including the cache subdirectory.
#
# Dies if the template cannot be processed or the cache file cannot be written.
#
sub generate_entry_cache {
    my $entry = shift;

    my $entry_dir = COMPILED_ENTRIES_PATH . '/' . get_cache_subdir($entry->{id});
    my $file_path = "$entry_dir/$entry->{id}-$entry->{type}.html";
    return if !$regenerate and -e $file_path;

    my $entry_data = {
        icon          => get_icon($entry),
        title         => encode_entities($entry->{title}),
        title_uri_esc => uri_escape($entry->{title}),
        sane_title    => sanitize_title($entry->{title}),
        host          => get_host($entry->{url}),
        link_dir      => get_link_dir($entry->{type}),
        user          => encode_entities($entry->{user}),
        url           => $entry->{url},
        url_uri_esc   => uri_escape($entry->{url}),
        date_added    => $entry->{date_added},
        reddit_id     => $entry->{reddit_id}
    };
    $entry_data->{title_first_char} = substr($entry_data->{sane_title}, 0, 1);

    my $output = '';
    $template->process('index_entry.html', { entry => $entry_data }, \$output)
        or die "Error: could not generate cache of entry '$entry->{id}': "
             . $template->error . "\n";

    unless (-d $entry_dir) {
        mkdir $entry_dir or die "Error: could not create '$entry_dir': $!";
    }
    open my $out, '>', $file_path or die "Error: could not open '$file_path': $!";
    print $out $output;
    # buffered write errors only show up when the handle is closed
    close $out or die "Error: could not close '$file_path': $!";
}
#
# get_cache_subdir
#
# Given an entry id, calculates the name of the cache subdirectory the entry
# belongs to: the id divided by ITEMS_PER_CACHE_DIR, truncated to an integer
# and multiplied by ITEMS_PER_CACHE_DIR again.
#
sub get_cache_subdir {
    my ($id) = @_;
    my $dir_index = int($id / ITEMS_PER_CACHE_DIR);
    return $dir_index * ITEMS_PER_CACHE_DIR;
}
#
# get_compiled_entries
#
# Given page_type ('main', 'pictures' or 'videos'), the function returns a
# list of paths of the compiled (cached) entries of that type, collected from
# all the subdirectories of COMPILED_ENTRIES_PATH.  The list is sorted by the
# numeric id the file names start with, newest (highest id) first.
#
sub get_compiled_entries {
    my $page_type = shift;

    # which cached entry files belong to which page type
    my %glob_for = (
        main     => "/*.html",
        pictures => "/*{picture,pictures}.html",
        videos   => "/*{video,videos}.html",
    );
    my $entry_search_glob = $glob_for{$page_type};

    my @entries;
    for my $sub_dir (grep { -d $_ } glob(COMPILED_ENTRIES_PATH . "/*")) {
        push @entries, glob($sub_dir . $entry_search_glob);
    }

    # file names look like '<id>-<type>.html'; int() takes the leading id and
    # would warn about the non-numeric rest
    no warnings 'numeric';
    my @compiled_entries = reverse sort {
        int(basename($a)) <=> int(basename($b))
    } @entries;
    return @compiled_entries;
}
#
# get_index_pages
#
# Given page_type, the function returns a list of paths of the existing index
# pages of that type found directly in WWW_PATH, sorted by page number.
# Pages without a number in their name (like 'index.html' or
# 'index-pictures.html') count as page 1.
#
sub get_index_pages {
    my $page_type = shift;
    my @index_pages;
    if ($page_type eq 'main') {
        # an 'index*.html' glob matches the picture and video index pages as
        # well, so keep only 'index.html' and 'index-<number>.html'.  Only
        # WWW_PATH itself is searched: the link and image cache directories
        # below it hold no index pages.
        @index_pages = grep { basename($_) =~ /\Aindex(?:-\d+)?\.html\z/ }
                       glob(WWW_PATH . "/index*.html");
    }
    else {
        my $page_search_glob = "/index-$page_type*.html";
        @index_pages = glob(WWW_PATH . $page_search_glob);
    }

    my $page_number_of = sub {
        my ($number) = $_[0] =~ /(\d+)\.html$/;
        return defined $number ? $number : 1; # default pages like 'index.html'
    };
    my @sorted_index_pages = sort {
        $page_number_of->($a) <=> $page_number_of->($b)
    } @index_pages;
    return @sorted_index_pages;
}
#
# get_link_dir
#
# Given an entry's type, returns the name of its link directory: "videos"
# for types containing 'video', "pictures" for types containing 'picture'.
# Dies on any other type.
#
sub get_link_dir {
    my ($type) = @_;
    for my $kind (qw(video picture)) {
        return $kind . 's' if $type =~ /$kind/;
    }
    die "unknown entry type: $type";
}
#
# get_icon
#
# Given an entry, the function gets a thumbnail (icon) for the entry and
# returns the URL or web path to use for it.
# For example, for youtube videos it gets the thumbnail from youtube's servers.
# Or, for some blogspot page it tries to find the first image in the content,
# download it, make a thumbnail and cache it locally.
#
# The steps are:
#  * if a cached thumbnail exists in IMAGE_CACHE_PATH, its web path is returned;
#  * otherwise ThumbExtractor is asked for a thumbnail of the entry's url.
#    A real thumbnail is used directly by its url, a plain image is scaled
#    down with ThumbMaker and cached;
#  * if ThumbExtractor finds nothing, video entries get the default icon,
#    other entries get a thumbnail of the best image ImageFinder finds;
#  * whenever something fails, the default icon of the entry's type is returned.
#
sub get_icon {
    my $entry = shift;
    my $sane_title   = sanitize_title($entry->{title});
    my $cache_subdir = get_cache_subdir($entry->{id});
    my $icon_name    = "$entry->{id}-$sane_title.jpg";

    # The directory is created on the filesystem only.  The web path below is
    # a URL path, not a directory that could (or should) be created.
    my $cache_dir = IMAGE_CACHE_PATH . '/' . $cache_subdir;
    unless (-d $cache_dir) {
        unless (mkdir $cache_dir) {
            print STDERR "Couldn't create image cache dir '$cache_dir': $!\n" if DEBUG;
        }
    }
    my $cached_icon_path  = "$cache_dir/$icon_name";
    my $rel_www_icon_path = IMAGE_RELATIVE_WWW . '/' . $cache_subdir . "/$icon_name";

    return $rel_www_icon_path if -e $cached_icon_path; # return cached icon

    my %thumb_options = (width => 77, height => 77, border => 1, border_color => '#C7DEF7');

    my $thex = ThumbExtractor->new;
    my $thumb = $thex->get_thumbnail($entry->{url});
    unless (defined $thumb) { # no thumb was found
        if ($entry->{type} =~ /video/) {
            # each video site requires a custom written handler for extracting thumbnails
            # if there was none, display default icon
            print STDERR "Couldn't extract thumbnail for video site at '$entry->{url}'\n" if DEBUG;
            return get_default_icon($entry->{type});
        }

        # let's find the best image on the page
        my $image_finder = ImageFinder->new(netpbm => NETPBM_PATH);
        my $best_img = $image_finder->find_best_image($entry->{url});
        unless ($best_img) { # no best image, hmm.
            print STDERR "No best image was found at '$entry->{url}'\n" if DEBUG;
            return get_default_icon($entry->{type});
        }

        # create a thumbnail for this image
        my $thumb_maker = ThumbMaker->new(netpbm => NETPBM_PATH);
        my $success = $thumb_maker->create_thumbnail($best_img, $cached_icon_path,
            { %thumb_options });
        unlink $best_img; # the found image is no longer needed
        unless ($success) {
            print STDERR $thumb_maker->get_error, "\n" if DEBUG;
            return get_default_icon($entry->{type});
        }
        return $rel_www_icon_path;
    }

    if ($thumb->is_thumb) { # a real thumbnail
        return $thumb->url;
    }

    # just an image, make a thumbnail of it
    my $thumb_maker = ThumbMaker->new(netpbm => NETPBM_PATH);
    my $success = $thumb_maker->create_thumbnail($thumb->url, $cached_icon_path,
        { %thumb_options });
    unless ($success) {
        print STDERR $thumb_maker->get_error, "\n" if DEBUG;
        return get_default_icon($entry->{type});
    }
    return $rel_www_icon_path;
}
#
# get_default_icon
#
# Given an entry type, returns the web path of the default icon of that type.
#
sub get_default_icon {
    my ($type) = @_;
    return '/icons/' . $type . '-big.gif';
}
#
# sanitize_title
#
# Given a title of a reddit story, the function sanitizes the title:
# lowercases it, removes [ ]'s, ( )'s and apostrophes, replaces every run of
# other non alphanumeric chars with a single '-' and strips leading and
# trailing '-'s.
#
# A result longer than 100 chars is cut to 100 chars and then loses whatever
# follows its last '-' (the last, probably cut, word).
#
sub sanitize_title {
    my ($raw_title) = @_;
    my $slug = lc $raw_title;

    $slug =~ tr/[]()'//d;              # dropped without leaving a '-'
    $slug =~ s/[^[:alnum:]]+/-/g;      # everything else becomes one '-' per run
    $slug =~ s/^-+//;
    $slug =~ s/-+$//;

    if (length($slug) > 100) {
        $slug = substr($slug, 0, 100);
        $slug =~ s/-*$//;              # there might now be one - at the end again
        $slug =~ s/-[[:alnum:]]*$//;   # drop the last word
    }
    return $slug;
}
#
# get_host
#
# Given a URL, the function returns the host portion of it, or "unknown" if
# the URI object built from the URL has no 'host' method.
#
sub get_host {
    my ($url) = @_;
    my $uri = URI->new($url);
    return $uri->can('host') ? $uri->host : "unknown";
}
#
# get_top_users
#
# Subroutine returns an array of hashrefs of the top 10 users.
# Each hash has two keys: 'user' and 'total_posts'.
#
# NOTE(review): the original query was lost (the here-doc was mangled).  The
# query below is reconstructed from this description and from the 'user'
# column of the 'reddit' table -- confirm against the original.
#
sub get_top_users {
    my $top_users_query = <<EOL;
SELECT user, COUNT(*) AS total_posts
  FROM reddit
 GROUP BY user
 ORDER BY total_posts DESC
 LIMIT 10
EOL
    my $top_users = $dbh->selectall_arrayref($top_users_query);

    my @ret;
    foreach my $row (@$top_users) {
        push @ret, {
            user        => $row->[0],
            total_posts => $row->[1]
        };
    }
    return @ret;
}
#
# get_top_hosts
#
# Subroutine returns an array of hashrefs of the top 10 domains, most posts
# first (domains with the same number of posts are ordered by name).
# Each hash has two keys: 'host' and 'total_posts'.
#
# The host of every url in the 'reddit' table is reduced to its last two
# labels ('www.img.example.com' counts as 'example.com').  Hyphens are part of
# a label, so 'my-site.com' is not counted as 'site.com'.  IPv4 addresses are
# counted as they are.  Urls without a host are skipped.
#
sub get_top_hosts {
    my $urls = $dbh->selectall_arrayref("SELECT url FROM reddit");

    my %hosts;
    foreach my $row (@$urls) {
        my $uri = URI->new($row->[0]);
        next unless $uri->can('host');

        my $host = $uri->host;
        next unless defined $host and length $host;

        $host =~ s/^www\.//;
        unless ($host =~ /^\d+(?:\.\d+){3}$/) {
            $host =~ s/.*?([\w-]+\.[\w-]+)$/$1/;
        }
        $hosts{$host}++;
    }

    my @sorted_keys = sort { $hosts{$b} <=> $hosts{$a} or $a cmp $b } keys %hosts;
    splice @sorted_keys, 10 if @sorted_keys > 10;

    my @ret = map +{ host => $_, total_posts => $hosts{$_} }, @sorted_keys;
    return @ret;
}
#
# get_new_entries
#
# Given the page type ('main', 'pictures' or 'videos'), the function returns a
# hash of the new entries of that type, where the hash key is the id of the
# entry and the value is a hashref of the entry's row in the 'reddit' table.
#
# An entry is new if its id is greater than the last_id recorded in the run
# status.  If a full regeneration was requested, all entries are returned.
#
# The type filter is a single IN (...) condition, so it cannot be torn apart
# by the precedence of AND over OR when it is combined with the id condition.
#
sub get_new_entries {
    my $page_type = shift;

    my %type_filter_for = (
        pictures => "type IN ('picture', 'pictures')",
        videos   => "type IN ('video', 'videos')",
    );

    my @conditions;
    my %run_status = get_run_status();
    if (exists $run_status{last_id} && !$regenerate) {
        push @conditions, "id > $run_status{last_id}";
    }
    if (exists $type_filter_for{$page_type}) {
        push @conditions, $type_filter_for{$page_type};
    }

    my $entry_query = "SELECT * FROM reddit";
    $entry_query .= " WHERE " . join(" AND ", @conditions) if @conditions;

    my $entries = $dbh->selectall_hashref($entry_query, [ 'id' ]);
    return %{$entries || {}};
}
#
# get_run_status
#
# Queries the reddit_status table and returns a hash of the status values of
# its first row.  If the table has no rows, (last_id => 0) is returned.
#
sub get_run_status {
    my %default_status = (last_id => 0);
    my $status_row = $dbh->selectrow_hashref("SELECT * FROM reddit_status");
    return $status_row ? %$status_row : %default_status;
}