Skip to content
This repository has been archived by the owner on Aug 14, 2023. It is now read-only.

Commit

Permalink
new arch WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
yamin8000 committed Apr 4, 2023
1 parent e9888fc commit d26872c
Show file tree
Hide file tree
Showing 10 changed files with 220 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@ import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.errorStyle
import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.t
import io.github.yamin8000.twitterscrapper.model.User
import io.github.yamin8000.twitterscrapper.util.Constants.instances
import io.github.yamin8000.twitterscrapper.util.Utility.sanitizeNum
import io.github.yamin8000.twitterscrapper.util.Utility.sanitizeUsername
import org.jsoup.Jsoup
import kotlin.random.Random
import kotlin.random.nextInt

object UserHelper {
object UserInfoHelper {
@Throws(Exception::class)
suspend fun getUser(
username: String,
Expand All @@ -19,12 +22,13 @@ object UserHelper {
else getUserFailedRequest(username, response.code)
}

/**
 * Fallback used when the request for [username] did not succeed.
 *
 * Retries through [getUser], passing a randomly chosen entry of `instances` as its second
 * argument. Previously an earlier `return getUser(username, temp.first())` statement sat in
 * front of the random retry and made it unreachable; `temp` itself was `instances.drop(0)`,
 * i.e. an unchanged copy of the list.
 *
 * NOTE(review): nothing here limits how often [getUser] and this function can call each other
 * if every attempt keeps failing - confirm whether [getUser] bounds the retries.
 *
 * @param username the user whose info was being fetched
 * @param httpCode HTTP status code of the failed response; only used in the error message
 * @return the result of the retried [getUser] call
 * @throws Exception when `instances` is empty, so there is nothing to retry against
 */
@Throws(Exception::class)
private suspend fun getUserFailedRequest(
    username: String,
    httpCode: Int
): User? {
    if (instances.isEmpty()) throw Exception("Fetching info for user: $username failed with $httpCode")
    return getUser(username, instances[Random.nextInt(instances.indices)])
}

Expand Down Expand Up @@ -54,6 +58,4 @@ object UserHelper {
null
}
}

private fun String?.sanitizeNum() = this?.filter { it != ',' }?.toIntOrNull() ?: 0
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
package io.github.yamin8000.twitterscrapper.helpers

import io.github.yamin8000.twitterscrapper.helpers.UserInfoHelper.getUser
import io.github.yamin8000.twitterscrapper.model.Tweet
import io.github.yamin8000.twitterscrapper.model.TweetStats
import io.github.yamin8000.twitterscrapper.model.TweetsPage
import io.github.yamin8000.twitterscrapper.util.Constants.DEFAULT_TWEETS_LIMIT
import io.github.yamin8000.twitterscrapper.util.Constants.FAILED_REQUEST_DELAY
import io.github.yamin8000.twitterscrapper.util.Constants.instances
import io.github.yamin8000.twitterscrapper.util.Utility.sanitizeNum
import io.github.yamin8000.twitterscrapper.util.Utility.sanitizeUsername
import io.github.yamin8000.twitterscrapper.web.retryingGet
import kotlinx.coroutines.*
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import kotlin.random.Random
import kotlin.random.nextLong

/**
 * Fetches the tweets of a single user by walking the user's paginated timeline.
 *
 * Pages are requested through [retryingGet], parsed with Jsoup and converted to [Tweet]s until
 * no further page is advertised or [limit] tweets have been collected.
 *
 * @property username account whose timeline is read; sanitized before it is put into the URL
 * @property limit maximum number of tweets returned by [get]
 */
class UserTweetsRequest(
    private val username: String,
    private val limit: Int = DEFAULT_TWEETS_LIMIT
) {
    /**
     * Downloads timeline pages one after another and returns at most [limit] tweets.
     *
     * @return the collected tweets in page order
     * @throws Exception when a page cannot be retrieved
     * @throws NullPointerException when the server answers 404 for the user
     */
    suspend fun get(): List<Tweet> {
        val tweets = mutableListOf<Tweet>()
        var cursor = ""
        do {
            val page = getUserTweetsPage(cursor)
            tweets.addAll(page.tweets)
            val nextCursor = page.cursor.orEmpty()
            // A blank cursor means no further page; an unchanged one would only re-request
            // the page that was just read, so both end the loop.
            val hasNextPage = nextCursor.isNotBlank() && nextCursor != cursor
            cursor = nextCursor
        } while (hasNextPage && tweets.size < limit)
        return tweets.take(limit)
    }

    /**
     * Requests one timeline page and parses it.
     *
     * @param cursor pagination token of the page; null or empty requests the first page
     * @param delayTime milliseconds to wait before sending the request
     * @param attempt number of 503 answers already seen for this page
     */
    private suspend fun getUserTweetsPage(
        cursor: String? = "",
        delayTime: Long = 0,
        attempt: Int = 0
    ): TweetsPage {
        delay(delayTime)
        val response = withContext(Dispatchers.IO) {
            // orEmpty() keeps a null cursor from being rendered as the text "null".
            retryingGet("${username.sanitizeUsername()}?cursor=${cursor.orEmpty()}")
        } ?: throw Exception("Failed to retrieve tweets page for $username")
        if (response.isSuccessful) return parseUserTweetsPage(response.body.string())
        val httpCode = response.code
        // The body of a rejected response is never read, so release it explicitly.
        response.close()
        return handleUserTweetsPageError(cursor, httpCode, attempt)
    }

    /**
     * Decides what to do with a non-successful page response.
     *
     * 404 is reported as a missing user, 503 is retried after a random delay taken from
     * [FAILED_REQUEST_DELAY] (at most [MAX_UNAVAILABLE_RETRIES] times), anything else fails.
     */
    private suspend fun handleUserTweetsPageError(
        cursor: String? = "",
        httpCode: Int,
        attempt: Int = 0
    ): TweetsPage = when {
        httpCode == 404 -> throw NullPointerException("$username not found")
        httpCode == 503 && attempt < MAX_UNAVAILABLE_RETRIES ->
            getUserTweetsPage(cursor, Random.nextLong(FAILED_REQUEST_DELAY), attempt + 1)
        else -> throw Exception("Failed to retrieve tweets page for $username")
    }

    /** Parses the HTML of one timeline page into its tweets and the cursor of the next page. */
    private suspend fun parseUserTweetsPage(
        html: String
    ): TweetsPage {
        val doc = Jsoup.parse(html)
        val cursor = parseCursor(doc)
        // Parsing looks up users through getUser, so it runs on the IO dispatcher.
        val tweets = withContext(Dispatchers.IO) {
            handleUserTweetsParsing(doc.selectFirst("div[class^=timeline-container] > div[class^=timeline]"))
        }
        return TweetsPage(tweets, cursor)
    }

    /**
     * Extracts the pagination token from the "show more" link, or null when there is no link.
     *
     * The link's `href` is presumably a query string such as `?cursor=TOKEN` (verify against
     * the served HTML); only the part after `cursor=` is kept so that the token can be placed
     * behind `?cursor=` again. An href without `cursor=` is returned unchanged.
     */
    private fun parseCursor(doc: Document): String? =
        doc.selectFirst("div[class^=show-more] a")
            ?.attr("href")
            ?.substringAfter("cursor=")

    /** Returns the tweets found in [timeline], or an empty list when the page has no timeline. */
    private suspend fun handleUserTweetsParsing(timeline: Element?): List<Tweet> {
        return if (timeline != null) parseUserTweets(timeline) else listOf()
    }

    /**
     * Converts the children of [timeline] to tweets: `timeline-item` elements directly,
     * `thread-line` elements through each of their children.
     */
    private suspend fun parseUserTweets(
        timeline: Element
    ): List<Tweet> {
        return buildList {
            timeline.children().forEach { tweet ->
                val htmlClass = tweet.className()
                if (htmlClass.startsWith("timeline-item"))
                    getTimelineItem(tweet)?.let { add(it) }
                if (htmlClass.startsWith("thread-line"))
                    tweet.children().forEach { item -> getTimelineItem(item)?.let { add(it) } }
            }
        }
    }

    /** Builds a [Tweet] from one timeline element; returns null for a null element. */
    private suspend fun getTimelineItem(
        tweet: Element?
    ): Tweet? {
        if (tweet == null) return null
        // First configured instance without its last character, used as prefix for relative links.
        val baseUrl = instances.first().dropLast(1)
        val thread = tweet.selectFirst("a[class^=show-thread]")
        val retweet = tweet.selectFirst("div[class^=retweet-header]")
        val username = getTweetUsername(tweet)
        // Looked up once and reused for originalTweeter instead of fetching the same user twice.
        val user = username?.let { getUser(it) }
        return Tweet(
            content = tweet.selectFirst("div[class^=tweet-content]")?.text() ?: "",
            date = getTweetDate(tweet),
            link = "$baseUrl${getTweetLink(tweet)}",
            user = user,
            stats = getTweetStats(tweet),
            isRetweet = retweet != null,
            isThreaded = thread != null,
            isPinned = tweet.selectFirst("div[class^=pinned]") != null,
            replies = listOf(),
            originalTweeter = if (retweet == null) null else user,
            quote = getQuotedTweet(tweet.selectFirst("div[class^=quote]")),
            // Without a show-thread link this is just the base URL.
            thread = "$baseUrl${thread?.attr("href") ?: ""}"
        )
    }

    /** Builds the quoted [Tweet] from a quote element; returns null when nothing is quoted. */
    private suspend fun getQuotedTweet(
        quote: Element?
    ): Tweet? {
        if (quote == null) return null
        val username = getTweetUsername(quote)
        return Tweet(
            content = quote.selectFirst("div[class^=quote-text]")?.text() ?: "",
            date = getTweetDate(quote),
            link = quote.selectFirst("a[class^=quote-link]")?.attr("href") ?: "",
            user = username?.let { getUser(it) },
            stats = TweetStats(),
            isRetweet = false,
            isThreaded = false,
            isPinned = false,
        )
    }

    /** Sanitized text of the element's username link, or null when there is none. */
    private fun getTweetUsername(tweet: Element) = tweet.selectFirst("a[class^=username]")?.text()?.sanitizeUsername()

    /** `title` attribute of the tweet's date link, or an empty string. */
    private fun getTweetDate(tweet: Element) = tweet.selectFirst("span[class^=tweet-date] a")?.attr("title") ?: ""

    /** `href` of the tweet's own link, or an empty string. */
    private fun getTweetLink(tweet: Element) = tweet.selectFirst("a[class^=tweet-link]")?.attr("href") ?: ""

    /**
     * Reads the counters under a tweet; the icon's CSS class decides which counter a value
     * belongs to. Counters that are not present stay 0.
     */
    private fun getTweetStats(
        tweet: Element
    ): TweetStats {
        val rawStats = tweet.selectFirst("div[class^=tweet-stat]")
        val stats = TweetStats()
        rawStats?.children()?.forEach { stat ->
            val icon = stat.selectFirst("div[class^=icon-container]")
            val value = icon?.text().sanitizeNum()
            when (icon?.children()?.firstOrNull()?.className() ?: "") {
                "icon-comment" -> stats.replies = value
                "icon-retweet" -> stats.retweets = value
                "icon-quote" -> stats.quotes = value
                "icon-heart" -> stats.likes = value
            }
        }
        return stats
    }

    private companion object {
        /** Upper bound for consecutive 503 retries of a single page. */
        const val MAX_UNAVAILABLE_RETRIES = 5
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@ data class Tweet(
val content: String,
val date: String,
val link: String,
val contentType: TweetContentType,
val user: User,
val user: User?,
val stats: TweetStats,
val isRetweet: Boolean,
val isThreaded: Boolean,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package io.github.yamin8000.twitterscrapper.model

/**
 * Engagement counters of a tweet; every counter defaults to 0.
 *
 * The properties are mutable so that a parser can create an all-zero instance and fill in
 * the counters it finds one by one. Each property is declared exactly once - a second,
 * read-only declaration of the same names would be a redeclaration error.
 */
data class TweetStats(
    var replies: Int = 0,
    var retweets: Int = 0,
    var quotes: Int = 0,
    var likes: Int = 0,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package io.github.yamin8000.twitterscrapper.model

/**
 * One parsed page of a timeline.
 *
 * @property tweets the tweets found on the page
 * @property cursor token of the following page, or null when the page advertises none
 */
data class TweetsPage(
    val tweets: List<Tweet>,
    val cursor: String?,
)
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
package io.github.yamin8000.twitterscrapper.modules

import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.printTable
import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.table
import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.readInteger
import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.readMultipleStrings
import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.t
import io.github.yamin8000.twitterscrapper.helpers.UserHelper.getUser
import io.github.yamin8000.twitterscrapper.helpers.UserInfoHelper.getUser
import io.github.yamin8000.twitterscrapper.helpers.UserTweetsRequest
import io.github.yamin8000.twitterscrapper.model.User
import io.github.yamin8000.twitterscrapper.util.Constants.DOWNLOAD_PATH
import io.github.yamin8000.twitterscrapper.util.FileUtils
import io.github.yamin8000.twitterscrapper.util.Menus
import io.github.yamin8000.twitterscrapper.util.Utility.csv
import kotlinx.coroutines.*
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.runBlocking
import java.io.File

class UsersModule : BaseModule(Menus.userMenu) {
Expand All @@ -28,12 +28,28 @@ class UsersModule : BaseModule(Menus.userMenu) {
0 -> showMenu()
1 -> runBlocking { showUsersInfo() }
2 -> runBlocking { saveUsersInfo() }
3 -> runBlocking {
getUserTweets()
}
}

run()
return 0
}

/**
 * Asks for one or more usernames and a tweet count (1..1000), then prints the fetched
 * tweets of every user to standard output.
 */
private suspend fun getUserTweets() {
    val usernames = readMultipleStrings("Username")
    val tweetsLimit = readInteger(message = "Enter number of tweets", range = 1..1000)
    for (name in usernames) {
        val tweets = UserTweetsRequest(name, tweetsLimit).get()
        for (tweet in tweets) {
            println(tweet)
        }
    }
}

private suspend fun saveUsersInfo() {
getUsersInfo().collect {
File("$path/${it.username.drop(1)}.txt").apply {
Expand All @@ -48,7 +64,7 @@ class UsersModule : BaseModule(Menus.userMenu) {
}

/**
 * Prompts for usernames and returns a flow that emits the [User] of each name for which
 * [getUser] returned a non-null result.
 *
 * The prompt is issued once, with the label "Username"; a second, nested prompt loop left
 * the lambda braces unbalanced and would have asked for the names twice.
 */
private suspend fun getUsersInfo(): Flow<User> = flow {
    readMultipleStrings("Username").forEach { username ->
        getUser(username)?.let { emit(it) }
    }
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ object Constants {

var DEFAULT_TWEETS_LIMIT = 500

val FAILED_REQUEST_DELAY = 50L..500L

const val ERROR_503 = "503 Service Temporarily Unavailable"

val instances = listOf(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ object Menus {
val userMenu = """
1. Get user(s) info
2. Save user(s) info
3. Get users' tweets
3. Save users' tweets
0. Back
""".trimIndent()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,6 @@ object Utility {
) = csvOf(headers, this, itemBuilder)

/** Lower-cases the receiver, trims surrounding whitespace and removes one leading `@`. */
fun String.sanitizeUsername(): String {
    val normalized = lowercase().trim()
    return normalized.removePrefix("@")
}

/**
 * Parses a number that may contain `,` separators.
 *
 * @return the parsed value, or 0 when the receiver is null or is not an integer once the
 * commas are removed
 */
fun String?.sanitizeNum(): Int {
    if (this == null) return 0
    val withoutSeparators = replace(",", "")
    return withoutSeparators.toIntOrNull() ?: 0
}
}
16 changes: 16 additions & 0 deletions src/main/kotlin/io/github/yamin8000/twitterscrapper/web/Client.kt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package io.github.yamin8000.twitterscrapper.web

import io.github.yamin8000.twitterscrapper.util.Constants.instances
import okhttp3.*
import okio.IOException
import kotlin.coroutines.resume
Expand All @@ -9,6 +10,21 @@ import kotlin.jvm.Throws

private var client = OkHttpClient()

/**
 * Sends a GET request for [partialUrl] and, on failure, retries against the next entry of
 * `instances`.
 *
 * An attempt fails when the underlying `get` call throws or when the response is not
 * successful. After a failure the next attempt uses `instances[(retries + 1) % instances.size]`,
 * so consecutive attempts go to different instances instead of repeating the same [base].
 * A rejected response is closed before the next attempt, and coroutine cancellation is
 * rethrown rather than treated as a failed request.
 *
 * @param partialUrl the part of the URL that is appended to the base address
 * @param retries number of attempts already made; also selects the default [base]
 * @param base address used for this attempt; later attempts take theirs from `instances`
 * @param retriesLimit retrying stops once [retries] has reached this value
 * @return the first successful response, or null when every attempt failed
 */
suspend fun retryingGet(
    partialUrl: String,
    retries: Int = 0,
    base: String = instances[retries],
    retriesLimit: Int = instances.size
): Response? {
    try {
        val response = get("$base$partialUrl")
        if (response.isSuccessful) return response
        // The rejected response is discarded, so release its connection before retrying.
        response.close()
    } catch (e: kotlin.coroutines.cancellation.CancellationException) {
        // Cancellation must propagate; retrying would keep a cancelled coroutine alive.
        throw e
    } catch (e: Exception) {
        // Any other failure counts as a failed attempt and is handled by the retry below.
    }
    if (retries >= retriesLimit) return null
    // The modulo keeps the index valid when the limit allows more attempts than there are
    // instances; with no instances configured the current base is reused.
    val nextBase = if (instances.isEmpty()) base else instances[(retries + 1) % instances.size]
    return retryingGet(partialUrl, retries + 1, nextBase, retriesLimit)
}

/**
* This hungry implementation of http get call using OkHttp,
* is consistent to get a successful result by costing
Expand Down

0 comments on commit d26872c

Please sign in to comment.