Store each article's full-text PDF in a per-journal directory.
diff --git a/exe/mathnet b/exe/mathnet
index 040a893..b39976a 100755
--- a/exe/mathnet
+++ b/exe/mathnet
@@ -9,6 +9,12 @@
class MathnetApplication
include Commander::Methods
+ def initialize
+ @minimal_interval = 1.0
+ @maximal_elapsed_time = 600.0
+ @base_dir = 'mathnet'
+ end
+
def run
program :name, 'Mathnet crawler.'
program :version, Mathnet::Crawler::VERSION
@@ -84,13 +90,14 @@
def download_aricles(articles)
Parallel.each(articles, :progress => 'Download texts') do |article|
- pdf_path = File.join 'mathnet', "#{article.title}.pdf"
+ pdf_path = article_path article
process_backoff do
article.full_text do |body|
pdf = File.new pdf_path, 'w'
pdf.write body
pdf.close
end
+ true
end
end
end
@@ -106,10 +113,16 @@
end
end
+ def article_path(article)
+ directory = File.join @base_dir, article.journal_title
+ if not Dir.exist? directory
+ FileUtils.mkdir_p directory
+ end
+ File.join directory, "#{article.title}.pdf"
+ end
+
def backoff
- minimal_interval = 0.1
- maximal_elapsed_time = 60.0
- ExponentialBackoff.new minimal_interval, maximal_elapsed_time
+ ExponentialBackoff.new @minimal_interval, @maximal_elapsed_time
end
end
diff --git a/lib/mathnet/crawler.rb b/lib/mathnet/crawler.rb
index 4052a62..ab4ef11 100644
--- a/lib/mathnet/crawler.rb
+++ b/lib/mathnet/crawler.rb
@@ -130,7 +130,7 @@
def full_text_url
client = HTTPClient.new
document = client.get_document @detail_url
- links = document.css(Entry::CSS_FILTER).select do |tag|
+ links = document.css(Entry::Listable::CSS_FILTER).select do |tag|
@pdf_url_reqexp.match tag['href']
end
fail ArgumentError, 'there is no full text link.' if links.empty?