Python and R together

Author

Klaus G.

Published

September 21, 2024

Starting with R code

library(reticulate)
library(ggplot2)
# The GitHub Actions workflow (current publish.yml) does not set up a venv.
# **Uncomment** the following line if you run this in VSCode:
#use_virtualenv("./.venv", required=TRUE)  # tells R reticulate to use this Python virtual environment

# Load the COVID data into an R data frame and report its number of rows
df_r <- read.csv("./data/covid.csv")
nrow(df_r)

[1] 20780

Some Python code

import pandas as pd

# Load the same CSV file with pandas and show its (rows, columns) shape
df_python_covid = pd.read_csv("./data/covid.csv")
df_python_covid.shape

(20780, 6)

# Count the missing values in each column
df_python_covid.isna().sum()

date                   0
state               2153
tests                  0
cases                  0
hospitalizations       0
deaths                 0
dtype: int64

# Drop every row that contains a missing value (modifies the dataframe in
# place), then re-check the per-column count of missing values
df_python_covid.dropna(inplace=True)
df_python_covid.isna().sum()

date                0
state               0
tests               0
cases               0
hospitalizations    0
deaths              0
dtype: int64

# Sum the deaths per state; the result is a pandas Series indexed by state
df_python_covid_agg = df_python_covid.groupby('state')['deaths'].sum()
df_python_covid_agg.head(5)

state
AK      305
AL    10148
AR     5319
AZ    16328
CA    54124
Name: deaths, dtype: int64

# The state codes form the index of the aggregated Series
df_python_covid_agg.index

Index(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'IA',
       'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO',
       'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK',
       'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI',
       'WV', 'WY'],
      dtype='object', name='state')

How to exchange dataframes between Python and R

# R code
# Python objects are reachable from R through reticulate's `py` object
nrow(reticulate::py$df_python_covid)

[1] 18627

# R code
# Pull the aggregated pandas Series into R; as the output below shows, it
# arrives with the state codes as names
back_to_r <- reticulate::py$df_python_covid_agg
head(back_to_r, 5)

   AK    AL    AR    AZ    CA 
  305 10148  5319 16328 54124

# Python code
# R objects are reachable from Python through the `r` object
type(r.df_r)

<class 'pandas.core.frame.DataFrame'>

Multi-index example

mpg (miles per gallon) is a dataset that ships with the ggplot2 package loaded at the start

# R code
# Preview the first five rows of the mpg dataset
head(mpg, n = 5)

# A tibble: 5 × 11
  manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
  <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…
5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa…

# Python code
# Fetch the R dataset mpg through the `r` object
df_mpg = r.mpg
#df_mpg.shape
#df_mpg.info()
# Count the rows per year within each (manufacturer, class) group; the result
# is a Series whose index has the levels manufacturer, class and year
df_mpg_grouped = df_mpg.groupby(['manufacturer','class'])['year'].value_counts()
# reset_index() turns the index levels into ordinary columns
df_mpg_grouped_reset_index = df_mpg_grouped.reset_index()
df_mpg_grouped.head(5)

manufacturer  class    year
audi          compact  1999    8
                       2008    7
              midsize  2008    2
                       1999    1
chevrolet     2seater  2008    3
Name: count, dtype: int64

# After reset_index() the former index levels are regular columns
df_mpg_grouped_reset_index.head(5)

  manufacturer    class  year  count
0         audi  compact  1999      8
1         audi  compact  2008      7
2         audi  midsize  2008      2
3         audi  midsize  1999      1
4    chevrolet  2seater  2008      3

# R code
# The multi-indexed Series arrives in R with each index tuple collapsed into
# a single name
head(reticulate::py$df_mpg_grouped, 5)  # multi-index

     ('audi', 'compact', 1999)      ('audi', 'compact', 2008) 
                             8                              7 
     ('audi', 'midsize', 2008)      ('audi', 'midsize', 1999) 
                             2                              1 
('chevrolet', '2seater', 2008) 
                             3

# With the index reset in Python, the same data shows up in R with one column
# per former index level
head(reticulate::py$df_mpg_grouped_reset_index, 5)  # collapsed multi-index

  manufacturer   class year count
1         audi compact 1999     8
2         audi compact 2008     7
3         audi midsize 2008     2
4         audi midsize 1999     1
5    chevrolet 2seater 2008     3

#reticulate::py_last_error()

Back to tutorial

Back to the tutorial