NYC

r-expert

SKILL.md

R Statistical Computing Expert

Expert guidance for R programming, statistical analysis, data visualization, and data science.

Core Concepts

R Fundamentals

  • Vectors and data frames
  • Factors and lists
  • Functions and apply family
  • Packages and libraries
  • R Markdown
  • Tidyverse ecosystem

Statistical Analysis

  • Descriptive statistics
  • Hypothesis testing
  • Regression analysis
  • ANOVA
  • Time series analysis
  • Machine learning

Data Visualization

  • ggplot2
  • Base R graphics
  • Interactive plots (plotly)
  • Statistical charts
  • Maps and spatial data

R Basics

# Vectors: the fundamental R data structure; c() concatenates values
numbers <- c(1, 2, 3, 4, 5)
# Avoid naming a variable `names` — it masks the base::names() function
people <- c("Alice", "Bob", "Charlie")

# Data frames: tabular data with one typed vector per column
df <- data.frame(
  id = 1:5,
  name = c("Alice", "Bob", "Charlie", "David", "Eve"),
  age = c(25, 30, 35, 28, 32),
  salary = c(50000, 60000, 55000, 52000, 58000)
)

# Subsetting
df[df$age > 30, ]       # rows where age > 30 (logical mask)
df[, c("name", "age")]  # select columns by name

# Functions: the last evaluated expression is the return value
calculate_mean <- function(x) {
  sum(x) / length(x)
}

# Vectorized arithmetic — prefer this over element-wise sapply()
df$age * 2                                # whole-vector operation
lapply(list(1:5, 6:10), sum)              # lapply() always returns a list
vapply(list(1:5, 6:10), sum, numeric(1))  # type-stable alternative to sapply()

# Control structures: if() takes a single (scalar) condition
if (mean(df$age) > 30) {
  print("Average age is above 30")
} else {
  print("Average age is 30 or below")
}

# Loops: seq_len() is safe when nrow(df) could be 0 (1:0 yields c(1, 0))
for (i in seq_len(nrow(df))) {
  print(df$name[i])
}

Tidyverse

# Load the tidyverse packages used below
library(dplyr)
library(tidyr)
library(stringr)

# dplyr operations: one pipeline that filters rows, derives columns,
# sorts, and aggregates.
# NOTE(review): assumes `df` (defined earlier) with age, name, salary columns.
df %>%
  filter(age > 28) %>%
  select(name, age, salary) %>%
  mutate(
    salary_bonus = salary * 1.1,
    # case_when() evaluates conditions top to bottom; TRUE is the catch-all
    age_group = case_when(
      age < 30 ~ "Young",
      age < 35 ~ "Mid-career",
      TRUE ~ "Senior"
    )
  ) %>%
  arrange(desc(salary)) %>%
  group_by(age_group) %>%
  # summarise() collapses to one row per age_group (and drops that grouping)
  summarise(
    count = n(),
    avg_salary = mean(salary),
    total_salary = sum(salary)
  )

# Reshaping data
wide_data <- data.frame(
  id = 1:3,
  year_2021 = c(100, 200, 150),
  year_2022 = c(120, 210, 160)
)

# Wide to long: one row per (id, year) pair
long_data <- wide_data %>%
  pivot_longer(
    cols = starts_with("year"),
    names_to = "year",
    values_to = "value",
    # strips the "year_" prefix so the year column holds "2021"/"2022"
    names_prefix = "year_"
  )

# Long to wide: inverse of the pivot above
wide_again <- long_data %>%
  pivot_wider(
    names_from = year,
    values_from = value,
    # re-adds the "year_" prefix to the reconstructed column names
    names_prefix = "year_"
  )

# String operations (stringr): each helper is vectorized over the column
df %>%
  mutate(
    name_upper = str_to_upper(name),
    name_length = str_length(name),
    first_letter = str_sub(name, 1, 1)
  )

# Joining data: ids 2-3 overlap, so the join types differ in which rows survive
df1 <- data.frame(id = 1:3, value1 = c("A", "B", "C"))
df2 <- data.frame(id = 2:4, value2 = c("X", "Y", "Z"))

inner_join(df1, df2, by = "id")  # only matching ids (2, 3)
left_join(df1, df2, by = "id")   # all of df1; NA where df2 has no match
full_join(df1, df2, by = "id")   # all ids from both; NA where unmatched

ggplot2 Visualization

library(ggplot2)

# Basic scatter plot with a fitted linear trend
# NOTE(review): assumes `df` with numeric age and salary columns
ggplot(df, aes(x = age, y = salary)) +
  geom_point(size = 3, color = "blue") +
  geom_smooth(method = "lm", se = TRUE) +  # se = TRUE draws the confidence band
  labs(
    title = "Age vs Salary",
    x = "Age (years)",
    y = "Salary ($)"
  ) +
  theme_minimal()

# Bar plot with facets
# NOTE(review): requires an age_group column (see the dplyr mutate() example);
# the base df defined earlier does not have one — confirm before running
ggplot(df, aes(x = name, y = salary, fill = age_group)) +
  geom_col() +
  facet_wrap(~ age_group) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Box plot with the raw points jittered on top
ggplot(df, aes(x = age_group, y = salary)) +
  geom_boxplot(fill = "lightblue") +
  geom_jitter(width = 0.2, alpha = 0.5)

# Histogram with density overlay
# after_stat(density) replaces the deprecated ..density.. syntax, and
# linewidth replaces size for line thickness (both since ggplot2 3.4)
ggplot(df, aes(x = salary)) +
  geom_histogram(aes(y = after_stat(density)), bins = 10, fill = "steelblue") +
  geom_density(color = "red", linewidth = 1)

# Time series
# NOTE(review): assumes time_series_df with a Date column `date` and numeric `value`
ggplot(time_series_df, aes(x = date, y = value)) +
  geom_line(color = "darkgreen") +
  geom_point() +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Statistical Analysis

# Descriptive statistics
summary(df)    # per-column min/quartiles/mean/max
mean(df$age)
median(df$salary)
sd(df$age)     # standard deviation
var(df$salary) # variance
quantile(df$age, probs = c(0.25, 0.5, 0.75))

# Correlation (Pearson by default); cor.test() adds a p-value and CI
cor(df$age, df$salary)
cor.test(df$age, df$salary)

# T-test comparing salary between two groups
# NOTE(review): requires a two-level gender column, which the example df
# defined earlier does not have — confirm against the actual data
t.test(df$salary ~ df$gender)

# ANOVA: does mean salary differ across age groups?
model <- aov(salary ~ age_group, data = df)
summary(model)
TukeyHSD(model)  # pairwise group comparisons with adjusted p-values

# Linear regression
# NOTE(review): assumes an `experience` column not present in the example df
lm_model <- lm(salary ~ age + experience, data = df)
summary(lm_model)

# Predictions with confidence intervals for new observations
new_data <- data.frame(age = c(30, 35), experience = c(5, 8))
predict(lm_model, new_data, interval = "confidence")

# Multiple regression (assumes an `education` column — verify)
multi_model <- lm(salary ~ age + experience + education, data = df)
summary(multi_model)

# Check assumptions: 2x2 grid of residual/QQ/scale/leverage diagnostic plots
par(mfrow = c(2, 2))
plot(multi_model)

# Logistic regression for a binary outcome
logit_model <- glm(outcome ~ age + salary,
                   data = df,
                   family = binomial(link = "logit"))
summary(logit_model)

Time Series Analysis

library(forecast)

# Create a monthly time series starting January 2020 (frequency = 12)
# NOTE(review): `data` here is an undefined placeholder (and masks the base
# utils::data() function) — replace it with the actual numeric vector
ts_data <- ts(data, start = c(2020, 1), frequency = 12)

# Decomposition into trend, seasonal, and remainder components
decomposed <- decompose(ts_data)
plot(decomposed)

# ARIMA model: auto.arima() selects the model orders automatically
auto_arima <- auto.arima(ts_data)
summary(auto_arima)

# Forecast the next 12 periods (h = forecast horizon)
forecast_result <- forecast(auto_arima, h = 12)
plot(forecast_result)

# Accuracy metrics (RMSE, MAE, MAPE, ...) for the fitted model
accuracy(forecast_result)

Machine Learning

library(caret)
library(randomForest)

# Split data: 80/20, stratified on the outcome
set.seed(123)  # fixed seed makes the partition reproducible
train_index <- createDataPartition(df$outcome, p = 0.8, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

# Train a random forest on all remaining columns
# NOTE(review): for classification, outcome should be a factor — confirm
rf_model <- randomForest(
  outcome ~ .,
  data = train_data,
  ntree = 500,       # number of trees in the forest
  importance = TRUE  # record variable importance for the plots below
)

# Predictions on the held-out test set
predictions <- predict(rf_model, test_data)

# Confusion matrix: accuracy, sensitivity, specificity, etc.
confusionMatrix(predictions, test_data$outcome)

# Feature importance (table and plot)
importance(rf_model)
varImpPlot(rf_model)

# Cross-validation: 10-fold CV via caret
train_control <- trainControl(
  method = "cv",
  number = 10,
  savePredictions = TRUE
)

cv_model <- train(
  outcome ~ .,
  data = train_data,
  method = "rf",
  trControl = train_control
)

print(cv_model)

R Markdown

---
title: "Analysis Report"
author: "Data Scientist"
date: "`r Sys.Date()`"
output:
  html_document:
    toc: true
    toc_float: true
    code_folding: hide
---

## Introduction

This analysis explores the relationship between variables.

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)
```

Data Loading

df <- read.csv("data.csv")
head(df)

Visualization

ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  theme_minimal()

Results

The analysis shows a correlation of `r cor(df$x, df$y)` between x and y.


## Data Import/Export

```r
# CSV (base R); row.names = FALSE avoids writing an extra index column
df <- read.csv("data.csv")
write.csv(df, "output.csv", row.names = FALSE)

# Excel: readxl for reading, writexl for writing
library(readxl)
library(writexl)
df <- read_excel("data.xlsx", sheet = "Sheet1")
write_xlsx(df, "output.xlsx")

# JSON via jsonlite; fromJSON() simplifies arrays of objects to a data frame
library(jsonlite)
df <- fromJSON("data.json")
write_json(df, "output.json")

# Database access through the DBI interface (SQLite backend here)
library(DBI)
library(RSQLite)
con <- dbConnect(SQLite(), "database.db")
df <- dbReadTable(con, "table_name")
dbWriteTable(con, "new_table", df)
dbDisconnect(con)  # always release the connection when done

# Web APIs with httr; as = "parsed" decodes the body by its content type
library(httr)
response <- GET("https://api.example.com/data")
data <- content(response, as = "parsed")

Best Practices

Code Style

  • Use <- for assignment
  • Follow tidyverse style guide
  • Write functions for repeated code
  • Use meaningful variable names
  • Comment complex operations
  • Use %>% pipe for readability

Data Analysis

  • Always explore data first
  • Check for missing values
  • Validate assumptions
  • Use visualization
  • Document your analysis
  • Make analysis reproducible

Performance

  • Vectorize operations
  • Use data.table for large data
  • Avoid growing objects in loops
  • Profile code with Rprof()
  • Use parallel processing
  • Cache expensive computations

Anti-Patterns

❌ Growing vectors in loops
❌ Not setting a random seed before stochastic steps
❌ Ignoring NA values
❌ Using attach()
❌ Not documenting code
❌ Hardcoding file paths
❌ Not checking model assumptions

Resources

Weekly Installs
26
First Seen
Jan 24, 2026
Installed on
opencode20
claude-code19
codex17
gemini-cli17
antigravity13
github-copilot12