Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix my.twitter.android #374

Merged
merged 2 commits into from
Aug 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion my/core/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def check_dateish(s: SeriesT[S1]) -> Iterable[str]:
all_timestamps = s.apply(lambda x: isinstance(x, (pd.Timestamp, datetime))).all()
if not all_timestamps:
return # not sure why it would happen, but ok
tzs = s.map(lambda x: x.tzinfo).drop_duplicates()
tzs = s.map(lambda x: x.tzinfo).drop_duplicates() # type: ignore[union-attr, var-annotated, arg-type, return-value, unused-ignore]
examples = s[tzs.index]
# todo not so sure this warning is that useful... except for stuff without tz
yield f'''
Expand Down
97 changes: 61 additions & 36 deletions my/twitter/android.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pathlib import Path
import re
from struct import unpack_from
from typing import Iterator, Sequence
from typing import Iterator, Sequence, Set

from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
from my.core.common import unique_everseen
Expand Down Expand Up @@ -209,41 +209,66 @@ def get_own_user_id(conn) -> str:
# 6 : always notifications??
# 42: tweets (bulk of them)
def _process_one(f: Path, *, where: str) -> Iterator[Res[Tweet]]:
# meh... maybe separate this function into special ones for tweets/bookmarks/likes
select_own = _SELECT_OWN_TWEETS in where
with sqlite_connect_immutable(f) as db:
if _SELECT_OWN_TWEETS in where:
if select_own:
own_user_id = get_own_user_id(db)
where = where.replace(_SELECT_OWN_TWEETS, own_user_id)

for (
tweet_id,
user_name,
user_username,
created_ms,
blob,
) in db.execute(
f'''
db_where = where.replace(_SELECT_OWN_TWEETS, own_user_id)
else:
db_where = where

# NOTE: we used to get this from 'timeline_view'
# however seems that it's missing a fair amount of data that's present in statuses table...
QUERY = '''
SELECT
statuses_status_id,
users_name,
users_username,
statuses_created,
CAST(statuses_content AS BLOB)
FROM timeline_view
WHERE timeline_data_type == 1 /* the only one containing tweets (among with some other stuff) */
AND timeline_data_type_group != 6 /* excludes notifications (some of them even have statuses_bookmarked == 1) */
AND {where}
ORDER BY timeline_sort_index DESC
''',
# TODO not sure about timeline_sort_index for favorites
):
assert blob is not None # just in case, but should be filtered by the sql query
yield Tweet(
id_str=tweet_id,
# TODO double check it's utc?
created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc),
screen_name=user_username,
text=_parse_content(blob),
)
CAST(statuses.status_id AS TEXT), /* int by default */
users.username,
statuses.created,
CAST(statuses.content AS BLOB),
statuses.quoted_tweet_id
FROM statuses FULL OUTER JOIN users
ON statuses.author_id == users.user_id
WHERE
/* there are sometimes a few shitty statuses in the db with weird ids which are duplicating other tweets
don't want to filter by status_id < 10 ** 10, since there might legit be statuses with low ids?
so this is the best I came up with..
*/
NOT (statuses.in_r_user_id == -1 AND statuses.in_r_status_id == -1 AND statuses.conversation_id == 0)
'''

def _query_one(*, where: str, quoted: Set[int]) -> Iterator[Res[Tweet]]:
for (
tweet_id,
user_username,
created_ms,
blob,
quoted_id,
) in db.execute(f'{QUERY} AND {where}'):
quoted.add(quoted_id) # if no quoted tweet, id is 0 here

try:
content = _parse_content(blob)
except Exception as e:
yield e
continue

yield Tweet(
id_str=tweet_id,
# TODO double check it's utc?
created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc),
screen_name=user_username,
text=content,
)

quoted: Set[int] = set()
yield from _query_one(where=db_where, quoted=quoted)
# get quoted tweets 'recursively'
# TODO maybe do it for favs/bookmarks too? not sure
while select_own and len(quoted) > 0:
db_where = 'status_id IN (' + ','.join(map(str, sorted(quoted))) + ')'
quoted = set()
yield from _query_one(where=db_where, quoted=quoted)


def _entities(*, where: str) -> Iterator[Res[Tweet]]:
Expand All @@ -264,15 +289,15 @@ def bookmarks() -> Iterator[Res[Tweet]]:
# NOTE: in principle we get the bulk of bookmarks via timeline_type == 30 filter
# however we still might miss on a few (I think the timeline_type 30 only refreshes when you enter bookmarks in the app)
# if you bookmarked in the home feed, it might end up as status_bookmarked == 1 but not necessarily as timeline_type 30
return _entities(where='statuses_bookmarked == 1')
return _entities(where='statuses.bookmarked == 1')


def likes() -> Iterator[Res[Tweet]]:
# NOTE: similarly to bookmarks, we could use timeline_type == 29, but it's only refreshed if we actually open likes tab
return _entities(where='statuses_favorited == 1')
return _entities(where='statuses.favorited == 1')


def tweets() -> Iterator[Res[Tweet]]:
# NOTE: where timeline_type == 18 covers quite a few of our own tweets, but not everything
# querying by our own user id seems the most exhaustive
return _entities(where=f'timeline_sender_id == {_SELECT_OWN_TWEETS}')
return _entities(where=f'users.user_id == {_SELECT_OWN_TWEETS} OR statuses.retweeted == 1')
Loading