1 year ago

#386258

test-img

Carola

Insert a new row in the middle of a DataFrame while iterating, using Python's typing.Iterable and typing.Tuple

Consider the following DataFrame:

country_ID ID date direction
ESP 0 2022-01-02 IN
UK 0 2022-01-05 OUT
ESP 1 2022-03-02 IN
UK 2 2022-02-05 IN
UK 2 2022-02-04 OUT
ESP 3 2022-02-10 IN

I implemented the following functionality using the Iterable and Tuple type hints from Python's typing module. The functions function1_ESP, function2_ESP and function2_UK have a structure similar to function1_UK (they return data that depends on conditions), so treat them as black boxes for the purposes of this problem.

from typing import Iterable, Tuple

def function1_UK(country_ID: str, date_old: pd.Timestamp, date_now: pd.Timestamp):
    """Build the result columns for one row and report whether it succeeded.

    Returns a pair ``(values, flag)`` where ``values`` is the tuple
    ``(town1, town2, town3, time_delta, error)``.

    * When ``country_ID`` is ``'ESP'``: the town counters are all zero,
      ``time_delta`` is ``date_now - date_old``, ``error`` is ``'OK'`` and
      ``flag`` is ``True``.
    * Otherwise: ``town1`` and ``town2`` are 1, ``town3`` is 0,
      ``time_delta`` is ``pd.NaT``, ``error`` is ``'ERROR_1'`` and ``flag``
      is ``False``. The dates are not used in this case.
    """
    # Failure path first: nothing is computed from the dates.
    if country_ID != 'ESP':
        return (1, 1, 0, pd.NaT, 'ERROR_1'), False

    elapsed = date_now - date_old
    return (0, 0, 0, elapsed, 'OK'), True

# function1_ESP,function2_ESP and function2_UK have similar structure.

def behaviour(df: pd.DataFrame) -> Iterable[Tuple[int, int, int, pd.Timedelta, str]]:
    """Yield one ``(town1, town2, town3, time, error)`` tuple per row of *df*.

    Rows are visited in ``df.itertuples()`` order and must expose the
    attributes ``ID``, ``country_ID``, ``direction`` and ``date``.

    * The first row seen for a given ``ID`` yields
      ``(0, 0, 0, pd.NaT, 'STARTING')``.
    * Every later row with that ``ID`` is dispatched on
      ``(row.direction, prev.direction)``, where ``prev`` is the row visited
      immediately before it (whatever its ``ID``). The handler is called as
      ``handler(prev.country_ID, prev.date, row.date)`` and must return
      ``(values, flag)``; ``values`` is yielded unchanged.

    Exactly one tuple is yielded per input row, so the result can be assigned
    straight to new columns of *df*.

    Raises:
        KeyError: if a direction pair is not one of the IN/OUT combinations
            listed in the dispatch tables.
    """
    # Handlers per (current direction, previous direction), one table per country.
    # NOTE(review): the ESP table routes mixed directions to function2_UK and
    # function2_ESP is never referenced -- confirm this is intentional.
    direction_ESP = {
        ('IN', 'IN'): function1_ESP,
        ('OUT', 'OUT'): function1_ESP,
        ('IN', 'OUT'): function2_UK,
        ('OUT', 'IN'): function2_UK,
    }
    direction_UK = {
        ('IN', 'IN'): function1_UK,
        ('OUT', 'OUT'): function1_UK,
        ('IN', 'OUT'): function2_UK,
        ('OUT', 'IN'): function2_UK,
    }

    # IDs already encountered; a set keeps the membership test O(1).
    seen_ids = set()
    prev = None

    for row in df.itertuples():
        if row.ID not in seen_ids:
            # First record for this ID: there is nothing to compare against.
            seen_ids.add(row.ID)
            yield (0, 0, 0, pd.NaT, 'STARTING')
        else:
            # 'ESP' rows use the ESP table; every other country uses the UK one.
            table = direction_ESP if row.country_ID == 'ESP' else direction_UK
            conditional_func = table[(row.direction, prev.direction)]
            insert_row, _extra_row_wanted = conditional_func(
                prev.country_ID, prev.date, row.date
            )
            yield insert_row
            # TODO: when the handler's flag is true, an additional row is meant
            # to follow this one:
            #   ESP:   country_ID='NEW COUNTRY ESP', ID=row.ID, date=row.date,
            #          direction='OUT', values (0, 0, 0, pd.NaT, 'TEST ESP')
            #   other: country_ID='NEW COUNTRY UK', ID=row.ID, date=row.date,
            #          direction='IN', values (0, 0, 1, pd.NaT, 'TEST UK')
            # It is deliberately not yielded here: that would produce more
            # tuples than df has rows, and callers that assign the result to
            # df columns need exactly one tuple per row. Inserting rows needs
            # a new frame built from complete rows instead.

        # The next row is always compared with the row just processed.
        prev = row

def main():
    """Store the tuples produced by ``behaviour`` in five new columns of ``df``.

    ``df`` is looked up as a module-level name; it is not defined in this
    snippet and must exist before ``main`` is called.
    """
    result_columns = ['visit_town1', 'visit_town2', 'visit_town3', 'time', 'error']
    # Exhaust the generator first so the frame is only modified once every
    # row has been processed.
    computed_rows = tuple(behaviour(df))
    df[result_columns] = computed_rows


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

In each case, if the returned flag is True, I want a new row with specific values to be inserted immediately after the row currently being iterated over. That is, each time this happens, the length of the DataFrame should increase by 1.

Thank you for your help.

python

pandas

tuples

iterable

0 Answers

Your Answer

Accepted video resources