Rachele Franceschini
Data mining Twitter but getting fewer tweets than expected
I would like to get tweets matching specific keywords, within a specific time window, in a specific language. I have all the credentials from the Twitter Developer Portal (API key, token, and bearer token). I tried one extraction using this code:
import tweepy
from twitter_authentication import bearer_token
import time
import pandas as pd

client = tweepy.Client(bearer_token, wait_on_rate_limit=True)

frana_tweets = []
for response in tweepy.Paginator(
        client.search_all_tweets,
        query='frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it place_country:IT',
        user_fields=['username', 'public_metrics', 'description', 'location',
                     'created_at', 'entities', 'url', 'verified'],
        # non_public_metrics removed from tweet_fields: it requires OAuth
        # user context and is only available for your own tweets, so
        # requesting it with an app-only bearer token fails
        tweet_fields=['created_at', 'geo', 'entities', 'lang',
                      'public_metrics', 'source'],
        place_fields=['country', 'place_type'],
        expansions=['author_id', 'geo.place_id'],
        start_time='2019-11-01T00:00:00Z',
        end_time='2019-11-30T23:59:59Z',
        max_results=500):
    time.sleep(1)  # full-archive search allows one request per second
    frana_tweets.append(response)
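One way to check whether the paginator returns everything the query matches is to compare it with the full-archive counts endpoint. A minimal sketch, my own addition, assuming get_all_tweets_count is available on the same Academic Research access that search_all_tweets requires:

# Sanity check (my addition): ask the counts endpoint how many tweets
# match the query, and compare with what the paginator returned
counts = client.get_all_tweets_count(
    query='frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it place_country:IT',
    start_time='2019-11-01T00:00:00Z',
    end_time='2019-11-30T23:59:59Z',
    granularity='day')
expected = counts.meta['total_tweet_count']
retrieved = sum(len(r.data or []) for r in frana_tweets)
print(f'API reports {expected} matching tweets; paginator returned {retrieved}')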
result = []
user_dict = {}
place_dict = {}

# Loop through each response object
for response in frana_tweets:
    # Put every expanded user into a dictionary keyed by id,
    # keeping only the fields we want
    for user in response.includes.get('users', []):
        user_dict[user.id] = {'name': user.name,
                              'username': user.username,
                              'created_at': user.created_at,
                              'description': user.description,
                              'entities': user.entities,
                              'location': user.location,
                              'pinned_tweet_id': user.pinned_tweet_id,
                              'protected': user.protected,
                              'followers_count': user.public_metrics['followers_count'],
                              'following_count': user.public_metrics['following_count'],
                              'tweet_count': user.public_metrics['tweet_count'],
                              'listed_count': user.public_metrics['listed_count'],
                              'url': user.url,
                              'verified': user.verified}
    # Same for the expanded places ('places' may be absent on some pages)
    for place in response.includes.get('places', []):
        place_dict[place.id] = {'geo_id': place.id,
                                'full_name': place.full_name,
                                'country': place.country,
                                'place_type': place.place_type}
    for tweet in response.data or []:
        # For each tweet, find the author's information
        author_info = user_dict[tweet.author_id]
        # Look up the tweet's own place; the original code reused the loop
        # variable `place`, which attached the wrong place to every tweet
        geo_info = (place_dict.get(tweet.geo['place_id'], {})
                    if tweet.geo else {})
        # Put all of the information we want to keep into one flat dictionary
        result.append({'author_id': tweet.author_id,
                       'name': author_info['name'],
                       'username': author_info['username'],
                       'author_created_at': author_info['created_at'],
                       'author_description': author_info['description'],
                       'author_entities': author_info['entities'],
                       'author_location': author_info['location'],
                       'pinned_tweet_id': author_info['pinned_tweet_id'],
                       'protected': author_info['protected'],
                       'author_followers': author_info['followers_count'],
                       'author_following': author_info['following_count'],
                       'author_tweet_count': author_info['tweet_count'],
                       'author_listed_count': author_info['listed_count'],
                       'author_url': author_info['url'],
                       'author_verified': author_info['verified'],
                       'id_text': tweet.id,  # identifies the tweet
                       'text': tweet.text,
                       'created_at': tweet.created_at,
                       'lang': tweet.lang,
                       'geo': tweet.geo,
                       'entities': tweet.entities,
                       'retweets': tweet.public_metrics['retweet_count'],
                       'replies': tweet.public_metrics['reply_count'],
                       'likes': tweet.public_metrics['like_count'],
                       'quote_count': tweet.public_metrics['quote_count'],
                       'source': tweet.source,
                       'geo_id': geo_info.get('geo_id'),
                       'full_name': geo_info.get('full_name'),
                       'country': geo_info.get('country'),
                       'place_type': geo_info.get('place_type')})

# Turn the list of dictionaries into a dataframe
df4 = pd.DataFrame(result)
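To rule out losses in the flattening step itself, the number of rows in the dataframe should equal the number of tweets across all pages. A quick check, my own addition, using the objects built above:

# My addition: every tweet on every page should have produced exactly one row
n_tweets = sum(len(r.data or []) for r in frana_tweets)
assert len(df4) == n_tweets, (len(df4), n_tweets)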
But it did not get all the data from Twitter: some tweets that should match the query were never extracted. Why? I tried another script, but I have the same problem:
import requests
import json
import twitter_authentication as config
import time
import pandas as pd

# Save your bearer token in a file called twitter_authentication.py in this
# directory. It should look like this:
# bearer_token = 'name_Bearer_token'
bearer_token = config.bearer_token

query = 'frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it'
out_file = 'raw_tweets.txt'
search_url = "https://api.twitter.com/2/tweets/search/all"

# Optional params: start_time, end_time, since_id, until_id, max_results,
# next_token, expansions, tweet.fields, media.fields, poll.fields,
# place.fields, user.fields
query_params = {'query': query,
                'start_time': '2019-11-01T00:00:00Z',
                'end_time': '2019-11-30T23:59:59Z',
                'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                'expansions': 'author_id,geo.place_id',
                'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                'max_results': 500}
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

def connect_to_endpoint(url, headers, params, next_token=None):
    if next_token:
        params['next_token'] = next_token
    response = requests.request("GET", search_url, headers=headers, params=params)
    time.sleep(3.1)  # stay under the full-archive rate limit
    print(response.status_code)
    if response.status_code != 200:  # 200 means OK; the original compared against 1000
        raise Exception(response.status_code, response.text)
    return response.json()
def get_tweets(num_tweets, output_fh):
    next_token = None
    tweets_stored = 0
    while tweets_stored < num_tweets:
        headers = create_headers(bearer_token)
        json_response = connect_to_endpoint(search_url, headers, query_params, next_token)
        if json_response['meta']['result_count'] == 0:
            break
        author_dict = {x['id']: x['username']
                       for x in json_response['includes']['users']}
        for tweet in json_response['data']:
            try:
                # The original line ended with a stray comma (storing a
                # one-element tuple) and the next line overwrote the tweet
                # id with its entities; both fixed here
                tweet['username'] = author_dict[tweet['author_id']]
            except KeyError:
                print(f"No data for {tweet['author_id']}")
            output_fh.write(json.dumps(tweet) + '\n')
            tweets_stored += 1
        try:
            next_token = json_response['meta']['next_token']
        except KeyError:
            break
    return None

def main():
    with open(out_file, 'w') as f:
        get_tweets(1000, f)

main()
tweets = []
with open(out_file, 'r') as f:
    for row in f.readlines():
        tweet = json.loads(row)
        tweets.append(tweet)

print(tweets[0])  # inspect the first stored tweet
df4 = pd.DataFrame(tweets)
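Logging the result_count of every page makes it visible whether the 1000-tweet cap passed to get_tweets is what truncates the output. A sketch, my own addition, reusing connect_to_endpoint and query_params from the script above:

# My addition: walk all pages for the same query, without storing tweets,
# and report how many results each page carries
def count_all_pages():
    headers = create_headers(bearer_token)
    query_params.pop('next_token', None)  # clear any token left by an earlier run
    total, next_token, page_no = 0, None, 0
    while True:
        page = connect_to_endpoint(search_url, headers, query_params, next_token)
        page_no += 1
        n = page['meta']['result_count']
        total += n
        print(f'page {page_no}: {n} tweets')
        next_token = page['meta'].get('next_token')
        if next_token is None:
            break
    print(f'{total} tweets match the query in total')

count_all_pages()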
Tags: python, extract, data-mining