skills/gptomics/bioskills/bio-pdb-geometric-analysis

bio-pdb-geometric-analysis

SKILL.md

Geometric Analysis

Measure distances, angles, and dihedrals. Superimpose structures and calculate RMSD. Find neighbor atoms and contacts.

Required Imports

from Bio.PDB import PDBParser, NeighborSearch, Superimposer
from Bio.PDB import calc_angle, calc_dihedral
import numpy as np

Distance Between Atoms

from Bio.PDB import PDBParser

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

chain = structure[0]['A']
atom1 = chain[100]['CA']
atom2 = chain[200]['CA']

# Direct subtraction gives distance
distance = atom1 - atom2
print(f'Distance: {distance:.2f} Angstroms')

# Or use numpy
import numpy as np
distance = np.linalg.norm(atom1.coord - atom2.coord)

Distance Matrix

import numpy as np
from Bio.PDB import PDBParser

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

ca_atoms = [r['CA'] for r in structure.get_residues() if r.has_id('CA') and r.id[0] == ' ']
n = len(ca_atoms)

dist_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        dist = ca_atoms[i] - ca_atoms[j]
        dist_matrix[i, j] = dist
        dist_matrix[j, i] = dist

print(f'Distance matrix shape: {dist_matrix.shape}')

Angle Between Three Atoms

from Bio.PDB import PDBParser, calc_angle

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

residue = structure[0]['A'][100]
n = residue['N']
ca = residue['CA']
c = residue['C']

# calc_angle takes Vector objects (atom.coord returns array, use atom.get_vector())
angle_rad = calc_angle(n.get_vector(), ca.get_vector(), c.get_vector())
angle_deg = np.degrees(angle_rad)
print(f'N-CA-C angle: {angle_deg:.1f} degrees')

Dihedral Angles

from Bio.PDB import PDBParser, calc_dihedral
import numpy as np

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

chain = structure[0]['A']

# Calculate phi angle (C-N-CA-C)
res_prev = chain[99]
res_curr = chain[100]

phi = calc_dihedral(
    res_prev['C'].get_vector(),
    res_curr['N'].get_vector(),
    res_curr['CA'].get_vector(),
    res_curr['C'].get_vector()
)
print(f'Phi: {np.degrees(phi):.1f} degrees')

# Calculate psi angle (N-CA-C-N)
res_next = chain[101]
psi = calc_dihedral(
    res_curr['N'].get_vector(),
    res_curr['CA'].get_vector(),
    res_curr['C'].get_vector(),
    res_next['N'].get_vector()
)
print(f'Psi: {np.degrees(psi):.1f} degrees')

Ramachandran Angles for All Residues

from Bio.PDB import PDBParser, PPBuilder, calc_dihedral
import numpy as np

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

ppb = PPBuilder()
phi_psi = []

for pp in ppb.build_peptides(structure):
    angles = pp.get_phi_psi_list()
    for residue, (phi, psi) in zip(pp, angles):
        if phi is not None and psi is not None:
            phi_psi.append((residue.resname, np.degrees(phi), np.degrees(psi)))

for name, phi, psi in phi_psi[:10]:
    print(f'{name}: phi={phi:.1f}, psi={psi:.1f}')

Finding Neighbor Atoms

from Bio.PDB import PDBParser, NeighborSearch

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

# Build search tree from all atoms
all_atoms = list(structure.get_atoms())
ns = NeighborSearch(all_atoms)

# Find atoms within radius of a point
center = structure[0]['A'][100]['CA'].coord
radius = 5.0  # Angstroms
neighbors = ns.search(center, radius)
print(f'Found {len(neighbors)} atoms within {radius}A')

# Find atoms within radius of another atom
ca_atom = structure[0]['A'][100]['CA']
neighbors = ns.search(ca_atom.coord, 5.0)

Finding Residue Contacts

from Bio.PDB import PDBParser, NeighborSearch

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

all_atoms = list(structure.get_atoms())
ns = NeighborSearch(all_atoms)

# Find all atom pairs within distance
contact_distance = 4.0
contacts = ns.search_all(contact_distance, level='R')  # R = residue level

print(f'Found {len(contacts)} residue contacts within {contact_distance}A')
for res1, res2 in contacts[:10]:
    print(f'  {res1.resname}{res1.id[1]} - {res2.resname}{res2.id[1]}')

Contact Levels

from Bio.PDB import NeighborSearch

# search_all returns pairs at specified level
# Level codes: A=atom, R=residue, C=chain, M=model, S=structure

atom_contacts = ns.search_all(4.0, level='A')      # Atom pairs
residue_contacts = ns.search_all(4.0, level='R')  # Residue pairs
chain_contacts = ns.search_all(10.0, level='C')   # Chain pairs

Superimposing Structures

from Bio.PDB import PDBParser, Superimposer

parser = PDBParser(QUIET=True)
ref_structure = parser.get_structure('ref', 'reference.pdb')
mobile_structure = parser.get_structure('mobile', 'mobile.pdb')

# Get CA atoms from both structures
ref_atoms = [r['CA'] for r in ref_structure.get_residues() if r.has_id('CA') and r.id[0] == ' ']
mobile_atoms = [r['CA'] for r in mobile_structure.get_residues() if r.has_id('CA') and r.id[0] == ' ']

# Ensure same number of atoms
n = min(len(ref_atoms), len(mobile_atoms))
ref_atoms = ref_atoms[:n]
mobile_atoms = mobile_atoms[:n]

# Superimpose
sup = Superimposer()
sup.set_atoms(ref_atoms, mobile_atoms)
print(f'RMSD before: {sup.rms:.2f} Angstroms')

# Apply transformation to all atoms in mobile structure
sup.apply(mobile_structure.get_atoms())

Calculating RMSD

from Bio.PDB import PDBParser, Superimposer
import numpy as np

parser = PDBParser(QUIET=True)
struct1 = parser.get_structure('s1', 'structure1.pdb')
struct2 = parser.get_structure('s2', 'structure2.pdb')

atoms1 = [r['CA'] for r in struct1.get_residues() if r.has_id('CA') and r.id[0] == ' ']
atoms2 = [r['CA'] for r in struct2.get_residues() if r.has_id('CA') and r.id[0] == ' ']

# Using Superimposer (with alignment)
sup = Superimposer()
sup.set_atoms(atoms1, atoms2)
rmsd_aligned = sup.rms
print(f'RMSD (aligned): {rmsd_aligned:.2f} A')

# Raw RMSD (no alignment)
coords1 = np.array([a.coord for a in atoms1])
coords2 = np.array([a.coord for a in atoms2])
rmsd_raw = np.sqrt(np.mean(np.sum((coords1 - coords2) ** 2, axis=1)))
print(f'RMSD (raw): {rmsd_raw:.2f} A')

CEAligner for Dissimilar Structures

from Bio.PDB import PDBParser, CEAligner

parser = PDBParser(QUIET=True)
ref = parser.get_structure('ref', 'reference.pdb')
mobile = parser.get_structure('mobile', 'query.pdb')

# CE alignment works for structures with different sequences
aligner = CEAligner()
aligner.set_reference(ref)
aligner.align(mobile)

print(f'RMSD: {aligner.rms:.2f} A')

# CEAligner automatically modifies mobile structure coordinates

Center of Mass

from Bio.PDB import PDBParser
import numpy as np

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

# Unweighted center (geometric center)
coords = np.array([a.coord for a in structure.get_atoms()])
center = coords.mean(axis=0)
print(f'Geometric center: {center}')

# Mass-weighted center (approximate - uses atom count)
# For accurate mass-weighted, use element masses
atoms = list(structure.get_atoms())
masses = {'C': 12.0, 'N': 14.0, 'O': 16.0, 'S': 32.0, 'H': 1.0}
total_mass = 0
weighted_sum = np.zeros(3)
for atom in atoms:
    mass = masses.get(atom.element, 12.0)
    weighted_sum += mass * atom.coord
    total_mass += mass
center_of_mass = weighted_sum / total_mass
print(f'Center of mass: {center_of_mass}')

Radius of Gyration

import numpy as np
from Bio.PDB import PDBParser

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

coords = np.array([a.coord for a in structure.get_atoms()])
center = coords.mean(axis=0)

# Radius of gyration
rg = np.sqrt(np.mean(np.sum((coords - center) ** 2, axis=1)))
print(f'Radius of gyration: {rg:.2f} A')

Finding Surface Residues

from Bio.PDB import PDBParser, NeighborSearch

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

# Simple approach: residues with few neighbors
all_atoms = list(structure.get_atoms())
ns = NeighborSearch(all_atoms)

surface_residues = []
for residue in structure.get_residues():
    if residue.id[0] != ' ':
        continue
    if not residue.has_id('CA'):
        continue

    ca = residue['CA']
    neighbors = ns.search(ca.coord, 10.0, level='R')
    if len(neighbors) < 15:  # Threshold for surface
        surface_residues.append(residue)

print(f'Surface residues: {len(surface_residues)}')

Vector Operations

from Bio.PDB import PDBParser
from Bio.PDB.vectors import Vector, rotaxis

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

# Get vectors from atoms
atom1 = structure[0]['A'][100]['CA']
atom2 = structure[0]['A'][101]['CA']

v1 = atom1.get_vector()
v2 = atom2.get_vector()

# Vector operations
diff = v2 - v1
print(f'Distance vector: {diff}')
print(f'Length: {diff.norm():.2f}')

# Normalize
unit = diff.normalized()

# Cross product
cross = v1 ** v2

# Dot product
dot = v1 * v2

Chi Angles (Sidechain Dihedrals)

from Bio.PDB import PDBParser, calc_dihedral
import numpy as np

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

# Chi1 angle for a residue (N-CA-CB-CG)
residue = structure[0]['A'][100]

if residue.has_id('CB') and residue.has_id('CG'):
    chi1 = calc_dihedral(
        residue['N'].get_vector(),
        residue['CA'].get_vector(),
        residue['CB'].get_vector(),
        residue['CG'].get_vector()
    )
    print(f'Chi1: {np.degrees(chi1):.1f} degrees')

Solvent Accessible Surface Area (SASA)

from Bio.PDB import PDBParser
from Bio.PDB.SASA import ShrakeRupley

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

# Calculate SASA using Shrake-Rupley algorithm
sr = ShrakeRupley()
sr.compute(structure, level='S')  # S=structure, M=model, C=chain, R=residue, A=atom

# Access total SASA
print(f'Total SASA: {structure.sasa:.2f} A^2')

# Access per-residue SASA
for residue in structure.get_residues():
    if hasattr(residue, 'sasa'):
        print(f'{residue.resname}{residue.id[1]}: {residue.sasa:.2f} A^2')

SASA with Custom Parameters

from Bio.PDB import PDBParser
from Bio.PDB.SASA import ShrakeRupley

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

# Higher precision (more points = slower but more accurate)
sr = ShrakeRupley(probe_radius=1.4, n_points=960)
sr.compute(structure, level='R')

# Per-atom SASA
for atom in structure.get_atoms():
    print(f'{atom.name}: {atom.sasa:.2f} A^2')

Identifying Buried vs Exposed Residues

from Bio.PDB import PDBParser
from Bio.PDB.SASA import ShrakeRupley

parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', 'protein.pdb')

sr = ShrakeRupley()
sr.compute(structure, level='R')

buried = []
exposed = []
for residue in structure.get_residues():
    if residue.id[0] != ' ':
        continue
    if hasattr(residue, 'sasa'):
        if residue.sasa < 10.0:  # Threshold in A^2
            buried.append(residue)
        else:
            exposed.append(residue)

print(f'Buried: {len(buried)}, Exposed: {len(exposed)}')

Related Skills

  • structure-io - Parse and write structure files
  • structure-navigation - Access chains, residues, atoms
  • structure-modification - Transform coordinates, modify structures
  • alignment - Sequence alignment for structure comparison
Weekly Installs
3
Installed on
windsurf2
trae2
opencode2
codex2
claude-code2
antigravity2