Rachele Franceschini
Data mining Twitter but getting fewer tweets than expected
I would like to get tweets matching specific keywords, within a specific time window, in a specific language. I have all the credentials from the Twitter Developer Portal (API key, token, and bearer token). I tried one extraction using this code:
import tweepy
from twitter_authentication import bearer_token
import time
import pandas as pd

client = tweepy.Client(bearer_token, wait_on_rate_limit=True)

frana_tweets = []
for response in tweepy.Paginator(
        client.search_all_tweets,
        query='frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it place_country:IT',
        user_fields=['username', 'public_metrics', 'description', 'location',
                     'created_at', 'entities', 'url', 'verified'],
        # non_public_metrics removed from tweet_fields: it requires OAuth
        # user context and is only available for your own tweets, so
        # requesting it with an app-only bearer token fails
        tweet_fields=['created_at', 'geo', 'entities', 'lang',
                      'public_metrics', 'source'],
        place_fields=['country', 'place_type'],
        expansions=['author_id', 'geo.place_id'],
        start_time='2019-11-01T00:00:00Z',
        end_time='2019-11-30T23:59:59Z',
        max_results=500):
    time.sleep(1)  # full-archive search allows one request per second
    frana_tweets.append(response)
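One way to check whether the paginator returns everything the query matches is to compare it with the full-archive counts endpoint. A minimal sketch, my own addition, assuming get_all_tweets_count is available on the same Academic Research access that search_all_tweets requires:

# Sanity check (my addition): ask the counts endpoint how many tweets
# match the query, and compare with what the paginator returned
counts = client.get_all_tweets_count(
    query='frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it place_country:IT',
    start_time='2019-11-01T00:00:00Z',
    end_time='2019-11-30T23:59:59Z',
    granularity='day')
expected = counts.meta['total_tweet_count']
retrieved = sum(len(r.data or []) for r in frana_tweets)
print(f'API reports {expected} matching tweets; paginator returned {retrieved}')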
result = []
user_dict = {}
place_dict = {}

# Loop through each response object
for response in frana_tweets:
    # Put every expanded user into a dictionary keyed by id,
    # keeping only the fields we want
    for user in response.includes.get('users', []):
        user_dict[user.id] = {'name': user.name,
                              'username': user.username,
                              'created_at': user.created_at,
                              'description': user.description,
                              'entities': user.entities,
                              'location': user.location,
                              'pinned_tweet_id': user.pinned_tweet_id,
                              'protected': user.protected,
                              'followers_count': user.public_metrics['followers_count'],
                              'following_count': user.public_metrics['following_count'],
                              'tweet_count': user.public_metrics['tweet_count'],
                              'listed_count': user.public_metrics['listed_count'],
                              'url': user.url,
                              'verified': user.verified}
    # Same for the expanded places ('places' may be absent on some pages)
    for place in response.includes.get('places', []):
        place_dict[place.id] = {'geo_id': place.id,
                                'full_name': place.full_name,
                                'country': place.country,
                                'place_type': place.place_type}
    for tweet in response.data or []:
        # For each tweet, find the author's information
        author_info = user_dict[tweet.author_id]
        # Look up the tweet's own place; the original code reused the loop
        # variable `place`, which attached the wrong place to every tweet
        geo_info = (place_dict.get(tweet.geo['place_id'], {})
                    if tweet.geo else {})
        # Put all of the information we want to keep into one flat dictionary
        result.append({'author_id': tweet.author_id,
                       'name': author_info['name'],
                       'username': author_info['username'],
                       'author_created_at': author_info['created_at'],
                       'author_description': author_info['description'],
                       'author_entities': author_info['entities'],
                       'author_location': author_info['location'],
                       'pinned_tweet_id': author_info['pinned_tweet_id'],
                       'protected': author_info['protected'],
                       'author_followers': author_info['followers_count'],
                       'author_following': author_info['following_count'],
                       'author_tweet_count': author_info['tweet_count'],
                       'author_listed_count': author_info['listed_count'],
                       'author_url': author_info['url'],
                       'author_verified': author_info['verified'],
                       'id_text': tweet.id,  # identifies the tweet
                       'text': tweet.text,
                       'created_at': tweet.created_at,
                       'lang': tweet.lang,
                       'geo': tweet.geo,
                       'entities': tweet.entities,
                       'retweets': tweet.public_metrics['retweet_count'],
                       'replies': tweet.public_metrics['reply_count'],
                       'likes': tweet.public_metrics['like_count'],
                       'quote_count': tweet.public_metrics['quote_count'],
                       'source': tweet.source,
                       'geo_id': geo_info.get('geo_id'),
                       'full_name': geo_info.get('full_name'),
                       'country': geo_info.get('country'),
                       'place_type': geo_info.get('place_type')})

# Turn the list of dictionaries into a dataframe
df4 = pd.DataFrame(result)
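To rule out losses in the flattening step itself, the number of rows in the dataframe should equal the number of tweets across all pages. A quick check, my own addition, using the objects built above:

# My addition: every tweet on every page should have produced exactly one row
n_tweets = sum(len(r.data or []) for r in frana_tweets)
assert len(df4) == n_tweets, (len(df4), n_tweets)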
But it did not get all the data from Twitter: some tweets that should match the query were never extracted. Why? I tried another script, but I have the same problem:
import requests
import json
import twitter_authentication as config
import time
import pandas as pd

# Save your bearer token in a file called twitter_authentication.py in this
# directory. It should look like this:
# bearer_token = 'name_Bearer_token'
bearer_token = config.bearer_token

query = 'frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it'
out_file = 'raw_tweets.txt'
search_url = "https://api.twitter.com/2/tweets/search/all"

# Optional params: start_time, end_time, since_id, until_id, max_results,
# next_token, expansions, tweet.fields, media.fields, poll.fields,
# place.fields, user.fields
query_params = {'query': query,
                'start_time': '2019-11-01T00:00:00Z',
                'end_time': '2019-11-30T23:59:59Z',
                'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                'expansions': 'author_id,geo.place_id',
                'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                'max_results': 500}
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

def connect_to_endpoint(url, headers, params, next_token=None):
    if next_token:
        params['next_token'] = next_token
    response = requests.request("GET", search_url, headers=headers, params=params)
    time.sleep(3.1)  # stay under the full-archive rate limit
    print(response.status_code)
    if response.status_code != 200:  # 200 means OK; the original compared against 1000
        raise Exception(response.status_code, response.text)
    return response.json()
def get_tweets(num_tweets, output_fh):
    next_token = None
    tweets_stored = 0
    while tweets_stored < num_tweets:
        headers = create_headers(bearer_token)
        json_response = connect_to_endpoint(search_url, headers, query_params, next_token)
        if json_response['meta']['result_count'] == 0:
            break
        author_dict = {x['id']: x['username']
                       for x in json_response['includes']['users']}
        for tweet in json_response['data']:
            try:
                # The original line ended with a stray comma (storing a
                # one-element tuple) and the next line overwrote the tweet
                # id with its entities; both fixed here
                tweet['username'] = author_dict[tweet['author_id']]
            except KeyError:
                print(f"No data for {tweet['author_id']}")
            output_fh.write(json.dumps(tweet) + '\n')
            tweets_stored += 1
        try:
            next_token = json_response['meta']['next_token']
        except KeyError:
            break
    return None

def main():
    with open(out_file, 'w') as f:
        get_tweets(1000, f)

main()
tweets = []
with open(out_file, 'r') as f:
    for row in f.readlines():
        tweet = json.loads(row)
        tweets.append(tweet)

print(tweets[0])  # inspect the first stored tweet
df4 = pd.DataFrame(tweets)
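Logging the result_count of every page makes it visible whether the 1000-tweet cap passed to get_tweets is what truncates the output. A sketch, my own addition, reusing connect_to_endpoint and query_params from the script above:

# My addition: walk all pages for the same query, without storing tweets,
# and report how many results each page carries
def count_all_pages():
    headers = create_headers(bearer_token)
    query_params.pop('next_token', None)  # clear any token left by an earlier run
    total, next_token, page_no = 0, None, 0
    while True:
        page = connect_to_endpoint(search_url, headers, query_params, next_token)
        page_no += 1
        n = page['meta']['result_count']
        total += n
        print(f'page {page_no}: {n} tweets')
        next_token = page['meta'].get('next_token')
        if next_token is None:
            break
    print(f'{total} tweets match the query in total')

count_all_pages()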
Tags: python, extract, data-mining