Spring 2026 — Expression-Based Filtering
# bcftools view -i: filter with INFO field expression
# Streams through VCF once (no index needed, no targets file)
# R2 and MAF are embedded in Michigan dose VCF INFO field
# Much faster than old -T targets approach
for chr in $(seq 1 22); do
bcftools view -i 'INFO/R2>=0.80 && INFO/MAF>=0.001' \
-Oz -o filtered/chr${chr}.clean.vcf.gz \
chr${chr}.dose.vcf.gz
bcftools index -f filtered/chr${chr}.clean.vcf.gz
done
# Concatenate all filtered chromosomes
$ bcftools concat filtered/chr{1..22}.clean.vcf.gz -Oz \
-o UZB_imputed_HQ.vcf.gz
$ bcftools index UZB_imputed_HQ.vcf.gz
# Convert to PLINK binary with dosage
# plink2 --vcf: read VCF file
# dosage=DS: use DS field for allele dosage (Michigan default)
$ plink2 --vcf UZB_imputed_HQ.vcf.gz dosage=DS \
--make-bed \
--out UZB_imputed_HQ
# Heterozygosity check
# plink2 --het: compute per-sample F (inbreeding coefficient)
# Outlier threshold: |F| > 0.2
$ plink2 --bfile UZB_imputed_HQ \
--het \
--out het_check
# Remove outliers if any
$ awk 'NR>1 && ($6 > 0.2 || $6 < -0.2) {print $1, $2}' \
het_check.het > het_outliers.txt
$ plink2 --bfile UZB_imputed_HQ \
--remove het_outliers.txt \
--make-bed \
--out UZB_imputed_HQ_clean
Winter 2025 — Targets-File Filtering
Compute sample-level statistics
# vcftools --missing-indv: compute per-sample missingness rate
# Outputs .imiss file with F_MISS column (fraction missing genotypes)
$ vcftools --gzvcf chr1.dose_normalized.vcf.gz \
--missing-indv \
--out chr1_stats
for chr in {1..22}; do
vcftools --gzvcf chr${chr}.dose_normalized.vcf.gz \
--missing-indv --out chr${chr}_stats
done
Sex and ancestry validation
$ plink2 --vcf chr{1..22}.dose_normalized.vcf.gz \
--check-sex --maf 0.05 \
--out final_qc
Remove outlier samples
Imputation quality pre-filter (Step 6.3): Before per-chromosome VCF filtering,
a high-quality variant list was extracted from the Michigan Imputation Server output
(
UZB_all.HQ_imputed.R2ge0p8.MAFge0p001.tsv) applying:
R² ≥ 0.8 (imputation quality),
MAF ≥ 0.001, and
IMPUTED == 1.
This yielded
10,009,530 HQ variant IDs used as a whitelist in the filtering step below.
See
Step 6 Technical Log for full details.
$ python3 detect_outliers.py -i chr1_stats.imiss \
--threshold 0.05 --output outliers_remove.txt
Found 24 outlier samples:
- 8 samples with >5% missing data
- 10 samples with sex discordance
- 6 samples with ancestry outliers
Final filtering and export
# bcftools view: filter samples and variants from VCF
# --samples-file ^FILE: EXCLUDE samples listed in FILE
# The ^ prefix means "exclude" (without ^ it means "include only")
# --include 'ID=@FILE': keep only variants whose ID matches a line in FILE
# The @ prefix means "read values from file" (bcftools expression syntax)
# --output-type z: compressed VCF (.vcf.gz)
# --threads 4: use 4 compression threads (speeds up bgzf output)
$ for chr in {1..22}; do
bcftools view chr${chr}.dose.vcf.gz \
--samples-file ^het_outliers_vcf_format.txt \
--include 'ID=@HQ_variant_ids.txt' \
--output-type z \
--output filtered_clean/chr${chr}.clean.vcf.gz \
--threads 4
tabix -p vcf filtered_clean/chr${chr}.clean.vcf.gz
# tabix -p vcf: create .tbi index for VCF (required for region queries)
done