From a1226d144ef2acb8a843796598ff6d48cf4abb31 Mon Sep 17 00:00:00 2001 From: Michael Werner Date: Thu, 14 May 2020 20:48:32 +0200 Subject: [PATCH 1/2] Remove unsafe dereference operations --- .../thelandscape/krawler/crawler/Krawler.kt | 39 ++++++++++--------- .../io/thelandscape/krawler/http/Requests.kt | 22 ++++++----- .../thelandscape/krawler/robots/RoboMinder.kt | 3 +- .../thelandscape/krawler/robots/RobotsTxt.kt | 15 +++---- .../io/thelandscape/KrawlQueueDaoTest.kt | 6 +-- 5 files changed, 45 insertions(+), 40 deletions(-) diff --git a/src/main/kotlin/io/thelandscape/krawler/crawler/Krawler.kt b/src/main/kotlin/io/thelandscape/krawler/crawler/Krawler.kt index 34eeb2e..c6cdc10 100644 --- a/src/main/kotlin/io/thelandscape/krawler/crawler/Krawler.kt +++ b/src/main/kotlin/io/thelandscape/krawler/crawler/Krawler.kt @@ -52,13 +52,15 @@ import java.util.concurrent.atomic.AtomicInteger * */ abstract class Krawler(val config: KrawlConfig = KrawlConfig(), - private var krawlHistory: KrawlHistoryIf? = null, - private var krawlQueues: List? = null, + krawlHistory: KrawlHistoryIf? = null, + krawlQueues: List? = null, robotsConfig: RobotsConfig? = null, private val requestProvider: RequestProviderIf = Requests(config), private val job: Job = Job()) { private val logger: Logger = LogManager.getLogger() + private val krawlHistory: KrawlHistoryIf + private val krawlQueues: List // Map of start URL -> int id to track branches of a crawl private val rootPageIds: MutableMap = mutableMapOf() @@ -66,21 +68,22 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(), private val maximumUsedId: AtomicInteger = AtomicInteger(0) init { - if (krawlHistory == null || krawlQueues == null) { + this.krawlHistory = if (krawlHistory == null) { val hsqlConnection = HSQLConnection(config.persistentCrawl, config.crawlDirectory) + KrawlHistoryHSQLDao(hsqlConnection.hsqlSession) + } else { + krawlHistory + } - if (krawlHistory == null) - krawlHistory = KrawlHistoryHSQLDao(hsqlConnection.hsqlSession) - + this.krawlQueues = if (krawlQueues == null) { // This is safe because we don't have any KrawlHistoryIf implementations other than HSQL - val histDao: KrawlHistoryHSQLDao = krawlHistory as KrawlHistoryHSQLDao - - if (krawlQueues == null) - // TODO: Dynamic number of queues? Why 10? - krawlQueues = (0 until 10).map { - KrawlQueueHSQLDao("queue$it", hsqlConnection.hsqlSession, histDao) - } - + val histDao: KrawlHistoryHSQLDao = this.krawlHistory as KrawlHistoryHSQLDao + // TODO: Dynamic number of queues? Why 10? + (0 until 10).map { + KrawlQueueHSQLDao("queue$it", this.krawlHistory.session, histDao) + } + } else { + krawlQueues } job.invokeOnCompletion { @@ -89,7 +92,7 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(), } } - private val scheduledQueue: ScheduledQueue = ScheduledQueue(krawlQueues!!, config, job) + private val scheduledQueue: ScheduledQueue = ScheduledQueue(this.krawlQueues, config, job) /** * Handle robots.txt @@ -254,7 +257,7 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(), onCrawlStart() val urls: Channel = scheduledQueue.krawlQueueEntryChannel - repeat(krawlQueues!!.size) { + repeat(krawlQueues.size) { GlobalScope.launch(Dispatchers.Default) { val actions: ReceiveChannel = produceKrawlActions(urls) doCrawl(actions) @@ -368,12 +371,12 @@ abstract class Krawler(val config: KrawlConfig = KrawlConfig(), // Do a history check val history: KrawlHistoryEntry = - if (krawlHistory!!.hasBeenSeen(krawlUrl)) { // If it has been seen + if (krawlHistory.hasBeenSeen(krawlUrl)) { // If it has been seen onRepeatVisit(krawlUrl, parent) logger.debug("History says no") return@async KrawlAction.Noop } else { - krawlHistory!!.insert(krawlUrl) + krawlHistory.insert(krawlUrl) } val visit = shouldVisit(krawlUrl) diff --git a/src/main/kotlin/io/thelandscape/krawler/http/Requests.kt b/src/main/kotlin/io/thelandscape/krawler/http/Requests.kt index b025fd3..da8eca6 100644 --- a/src/main/kotlin/io/thelandscape/krawler/http/Requests.kt +++ b/src/main/kotlin/io/thelandscape/krawler/http/Requests.kt @@ -80,13 +80,15 @@ internal class HistoryTrackingRedirectStrategy: DefaultRedirectStrategy() { private val pcm: PoolingHttpClientConnectionManager = PoolingHttpClientConnectionManager() class Requests(private val krawlConfig: KrawlConfig, - private var httpClient: CloseableHttpClient? = null) : RequestProviderIf { + httpClient: CloseableHttpClient? = null) : RequestProviderIf { private val logger: Logger = LogManager.getLogger() + private val httpClient: CloseableHttpClient init { - if (httpClient == null) { - val requestConfig = RequestConfig.custom() + this.httpClient = + if (httpClient == null) { + val requestConfig = RequestConfig.custom() .setCookieSpec(CookieSpecs.STANDARD) .setExpectContinueEnabled(false) .setContentCompressionEnabled(krawlConfig.allowContentCompression) @@ -96,15 +98,15 @@ class Requests(private val krawlConfig: KrawlConfig, .setSocketTimeout(krawlConfig.socketTimeout) .build() - val trustStrat = TrustStrategy { _: Array, _: String -> true } + val trustStrat = TrustStrategy { _: Array, _: String -> true } - val sslContext = SSLContextBuilder.create() + val sslContext = SSLContextBuilder.create() .loadTrustMaterial(null, trustStrat) .build() - val redirectStrategy = HistoryTrackingRedirectStrategy() + val redirectStrategy = HistoryTrackingRedirectStrategy() - httpClient = HttpClients.custom() + HttpClients.custom() .setDefaultRequestConfig(requestConfig) .setSSLContext(sslContext) .setSSLHostnameVerifier(NoopHostnameVerifier()) @@ -112,7 +114,9 @@ class Requests(private val krawlConfig: KrawlConfig, .setUserAgent(krawlConfig.userAgent) .setConnectionManager(pcm) .build() - } + } else { + httpClient + } } /** Fetch the robots.txt file from a domain @@ -186,7 +190,7 @@ class Requests(private val krawlConfig: KrawlConfig, } val resp: RequestResponse = try { - val response: HttpResponse? = httpClient!!.execute(req, httpContext) + val response: HttpResponse? = httpClient.execute(req, httpContext) if (response == null) ErrorResponse(url) else retFun(url, response, httpContext) } catch (e: Throwable) { ErrorResponse(url, e.toString()) diff --git a/src/main/kotlin/io/thelandscape/krawler/robots/RoboMinder.kt b/src/main/kotlin/io/thelandscape/krawler/robots/RoboMinder.kt index 85d82da..22080a0 100644 --- a/src/main/kotlin/io/thelandscape/krawler/robots/RoboMinder.kt +++ b/src/main/kotlin/io/thelandscape/krawler/robots/RoboMinder.kt @@ -90,6 +90,7 @@ class RoboMinder(private val userAgent: String, rules.put(url.hierarchicalPart, process(resp)) } - return rules.getIfPresent(url.hierarchicalPart)!!.invoke(withoutGetParams) + val rule = rules.getIfPresent(url.hierarchicalPart) ?: return true + return rule(withoutGetParams) } } diff --git a/src/main/kotlin/io/thelandscape/krawler/robots/RobotsTxt.kt b/src/main/kotlin/io/thelandscape/krawler/robots/RobotsTxt.kt index 9b8841c..4ecd021 100644 --- a/src/main/kotlin/io/thelandscape/krawler/robots/RobotsTxt.kt +++ b/src/main/kotlin/io/thelandscape/krawler/robots/RobotsTxt.kt @@ -28,8 +28,7 @@ class RobotsTxt(url: KrawlUrl, response: HttpResponse, context: HttpClientContex // TODO: Improve handling: https://en.wikipedia.org/wiki/Robots_exclusion_standard#About_the_standard - var disallowRules: Map> = mapOf() - private set + val disallowRules: Map> // Do all the parsing in a single pass in here init { @@ -47,14 +46,12 @@ class RobotsTxt(url: KrawlUrl, response: HttpResponse, context: HttpClientContex if (key == "user-agent") { userAgent = value continue - } - - if (key == "disallow") { - if (userAgent in rules) - rules[userAgent]!!.add(value) - else + } else if (key == "disallow") { + if (userAgent in rules) { + rules.getOrPut(userAgent, { mutableSetOf() }).add(value) + } else { rules[userAgent] = mutableSetOf(value) - + } continue } } diff --git a/src/test/kotlin/io/thelandscape/KrawlQueueDaoTest.kt b/src/test/kotlin/io/thelandscape/KrawlQueueDaoTest.kt index b189fa3..427271e 100644 --- a/src/test/kotlin/io/thelandscape/KrawlQueueDaoTest.kt +++ b/src/test/kotlin/io/thelandscape/KrawlQueueDaoTest.kt @@ -61,7 +61,7 @@ class KrawlQueueDaoTest { KrawlQueueEntry("http://www.e5.com/", 0, KrawlHistoryEntry(), timestamp = LocalDateTime.now()) ) queueDao.push(pushers) - val res = queueDao.deleteByAge(LocalDateTime.now().minusDays(1)) - assertEquals(2, res) + val res = queueDao.deleteByAge(LocalDateTime.now()) + assertEquals(5, res) } -} \ No newline at end of file +} From fc163d96e42941ffe7ea9c912b1176b0d42390fe Mon Sep 17 00:00:00 2001 From: Michael Werner Date: Thu, 14 May 2020 20:55:53 +0200 Subject: [PATCH 2/2] Revert change in test --- src/test/kotlin/io/thelandscape/KrawlQueueDaoTest.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/kotlin/io/thelandscape/KrawlQueueDaoTest.kt b/src/test/kotlin/io/thelandscape/KrawlQueueDaoTest.kt index 427271e..ed8b529 100644 --- a/src/test/kotlin/io/thelandscape/KrawlQueueDaoTest.kt +++ b/src/test/kotlin/io/thelandscape/KrawlQueueDaoTest.kt @@ -61,7 +61,7 @@ class KrawlQueueDaoTest { KrawlQueueEntry("http://www.e5.com/", 0, KrawlHistoryEntry(), timestamp = LocalDateTime.now()) ) queueDao.push(pushers) - val res = queueDao.deleteByAge(LocalDateTime.now()) - assertEquals(5, res) + val res = queueDao.deleteByAge(LocalDateTime.now().minusDays(1)) + assertEquals(2, res) } }