Schema Questions and SQL Examples

Can you show the relationship between all the tables in the ChEMBL database?

A PNG of the schema relationships can be found on the FTP site in the latest release directory: ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest

It is also possible to view an interactive schema diagram on the ChEMBL interface: https://www.ebi.ac.uk/chembl/db_schema

Retrieve all the bioactivity data for bacterial targets:

SELECT md.chembl_id AS compound_chembl_id,
cs.canonical_smiles,
act.standard_type,
act.standard_value,
act.standard_units,
td.chembl_id AS target_chembl_id,
td.organism,   td.pref_name
FROM target_dictionary td
  JOIN assays a ON td.tid = a.tid
  JOIN activities act ON a.assay_id = act.assay_id
  JOIN molecule_dictionary md ON act.molregno = md.molregno
  JOIN compound_structures cs ON md.molregno   = cs.molregno
  JOIN organism_class oc ON td.tax_id = oc.tax_id
    AND oc.L1 = 'Bacteria';

Retrieve activity details for compound and all its salts which have an IC50 bioactivity value in nM against a target of interest:

-- Compound is Sildenafil (CHEMBL192)
-- Target is human PDE5 (CHEMBL1827)
SELECT m.chembl_id AS compound_chembl_id,
s.canonical_smiles,
r.compound_key,
NVL(TO_CHAR(d.pubmed_id),d.doi) AS pubmed_id_or_doi,
a.description                   AS assay_description,
act.standard_type,
act.standard_relation,
act.standard_value,
act.standard_units,
act.activity_comment,
t.chembl_id                    AS target_chembl_id,
t.pref_name                    AS target_name,
t.organism                     AS target_organism
FROM compound_structures s
  RIGHT JOIN molecule_dictionary m ON s.molregno = m.molregno
  JOIN compound_records r ON m.molregno = r.molregno
  JOIN docs d ON r.doc_id = d.doc_id
  JOIN activities act ON r.record_id = act.record_id
  JOIN assays a ON act.assay_id = a.assay_id
  JOIN target_dictionary t ON a.tid = t.tid
    AND t.chembl_id      = 'CHEMBL1827'
    AND m.chembl_id IN
         (SELECT DISTINCT
            m1.chembl_id
          FROM molecule_dictionary m1
            JOIN molecule_hierarchy mh ON mh.molregno = m1.molregno
            JOIN molecule_dictionary m2 ON mh.parent_molregno = m2.molregno
              AND m2.chembl_id = 'CHEMBL192')
    AND act.standard_type = 'IC50'
    AND act.standard_units = 'nM';

Retrieve compounds which are selective to one target over a second target:

-- Compounds which are selective for Human CDK2 (CHEMBL301) over Human CDK5 (CHEMBL4036) 
-- Selectivity is based on comparing binding affinities using IC50 values.  
SELECT md.chembl_id,
cs.canonical_smiles
FROM target_dictionary td
  JOIN assays a ON td.tid = a.tid
  JOIN activities act ON a.assay_id = act.assay_id
  JOIN molecule_dictionary md ON md.molregno = act.molregno
  JOIN compound_structures cs ON md.molregno = cs.molregno
    AND act.standard_relation = '='
    AND act.standard_type     IN ('IC50')
    AND act.standard_units    = 'nM'
    AND act.standard_value    < 50
    AND td.chembl_id          = 'CHEMBL301'
INTERSECT
SELECT md.chembl_id,
cs.canonical_smiles
FROM target_dictionary td
  JOIN assays a ON td.tid = a.tid
  JOIN activities act ON a.assay_id = act.assay_id
  JOIN molecule_dictionary md ON md.molregno = act.molregno
  JOIN compound_structures cs ON md.molregno = cs.molregno
AND act.standard_relation     = '='
AND act.standard_type         IN ('IC50')
AND act.standard_units        = 'nM'
AND act.standard_value        > 200
AND td.chembl_id              = 'CHEMBL4036';

Retrieve target ChEMBL_ID, target_name, target_type, protein accessions and sequences for all protein targets:

SELECT t.chembl_id AS target_chembl_id,
t.pref_name        AS target_name,
t.target_type,
c.accession        AS protein_accession,
c.sequence         AS protein_sequence
FROM target_dictionary t
  JOIN target_type tt ON t.target_type = tt.target_type
  JOIN target_components tc ON t.tid = tc.tid
  JOIN component_sequences c ON tc.component_id = c.component_id
AND tt.parent_type  = 'PROTEIN';

Retrieve PK data from 'Curated Drug Pharmacokinetic Data' source in ChEMBL for drug:

-- Data for levofloxacin
SELECT DISTINCT
  d.title,
  min(decode(ap.standard_type, 'DATASET', nvl(to_char(ap.standard_value), ap.standard_text_value)))         dataset,
  a.assay_id,
  a.description,
  min(decode(actp.standard_type, 'DOSED_COMPOUND_NAME',
             nvl(to_char(actp.standard_value), actp.standard_text_value) || ' ' ||
             actp.standard_units))                                                                          dosed_compound_name,
  min(decode(actp.standard_type, 'DOSE',
             nvl(to_char(actp.standard_value), actp.standard_text_value) || ' ' || actp.standard_units))    dose,
  min(decode(actp.standard_type, 'DOSAGE_FORM',
             nvl(to_char(actp.standard_value), actp.standard_text_value) || ' ' || actp.standard_units))    dosage_form,
  min(decode(actp.standard_type, 'REGIMEN',
             nvl(to_char(actp.standard_value), actp.standard_text_value) || ' ' || actp.standard_units))    regimen,
  min(decode(actp.standard_type, 'ROUTE', nvl(to_char(actp.standard_value), actp.standard_text_value)))     route,
  min(decode(actp.standard_type, 'GENDER', nvl(to_char(actp.standard_value), actp.standard_text_value)))    gender,
  min(decode(actp.standard_type, 'AGE_RANGE', nvl(to_char(actp.standard_value), actp.standard_text_value))) age_range,
  min(decode(actp.standard_type, 'HEALTH_STATUS', nvl(to_char(actp.standard_value),
                                                      actp.standard_text_value)))                           health_status,
  min(decode(actp.standard_type, 'TISSUE', nvl(to_char(actp.standard_value),
                                                      actp.standard_text_value)))                           tissue,
  cr.molregno,
  cr.compound_name,
  act.activity_id,
  act.toid,
  act.standard_type,
  act.standard_relation,
  act.standard_value,
  act.standard_units,
  act.activity_comment
FROM source s
  JOIN compound_records cr ON s.src_id = cr.src_id
  JOIN docs d ON d.doc_id = cr.doc_id
  JOIN activities act ON cr.record_id = act.record_id AND cr.doc_id = act.doc_id
  JOIN activity_properties actp ON act.activity_id = actp.activity_id
  JOIN assays a ON act.assay_id = a.assay_id
  JOIN assay_parameters ap ON a.assay_id = ap.assay_id
                              AND s.src_description = 'Curated Drug Pharmacokinetic Data'
                              AND cr.compound_name LIKE 'LEVOFLOXACIN%'
GROUP BY d.title, a.assay_id, a.description, cr.molregno, cr.compound_name, act.activity_id, act.toid,
  act.standard_type, act.standard_relation, act.standard_value, act.standard_units, act.activity_comment
ORDER BY cr.compound_name, act.toid, act.standard_type;

Retrieve compound activity details for all targets containing a protein of interest:

-- Protein of interest is human M2 muscarinic receptor (P08172) 
SELECT DISTINCT
  m.chembl_id                      AS compound_chembl_id,
  s.canonical_smiles,
  r.compound_key,
  NVL(TO_CHAR(d.pubmed_id), d.doi) AS pubmed_id_or_doi,
  a.description                    AS assay_description,
  act.standard_type,
  act.standard_relation,
  act.standard_value,
  act.standard_units,
  act.activity_comment,
  t.chembl_id                      AS target_chembl_id,
  t.pref_name                      AS target_name,
  t.target_type
FROM compound_structures s
  RIGHT JOIN molecule_dictionary m ON s.molregno = m.molregno
  JOIN compound_records r ON m.molregno = r.molregno
  JOIN docs d ON r.doc_id = d.doc_id
  JOIN activities act ON r.record_id = act.record_id
  JOIN assays a ON act.assay_id = a.assay_id
  JOIN target_dictionary t ON a.tid = t.tid
  JOIN target_components tc ON t.tid = tc.tid
  JOIN component_sequences cs ON tc.component_id = cs.component_id
    AND cs.accession = 'P08172';

Retrieve compound activity details for a target:

 -- Target is Human PDE5 (CHEMBL1827) 
SELECT m.chembl_id AS compound_chembl_id,   
s.canonical_smiles,   
r.compound_key,   
NVL(TO_CHAR(d.pubmed_id),d.doi) AS pubmed_id_or_doi,   
a.description                   AS assay_description,   act.standard_type,   
act.standard_relation,   
act.standard_value,   
act.standard_units,   
act.activity_comment 
FROM chembl.compound_structures s
 RIGHT JOIN chembl.molecule_dictionary m on s.molregno = m.molregno 
 JOIN chembl.compound_records r on m.molregno = r.molregno  
 JOIN chembl.docs d on r.doc_id = d.doc_id 
 JOIN chembl.activities act on r.record_id = act.record_id
 JOIN chembl.assays a on act.assay_id = a.assay_id 
 JOIN chembl.target_dictionary t on a.tid = t.tid 
AND t.chembl_id = 'CHEMBL1827';

Can you show me how to use SQL to only extract the PubChem data from ChEMBL?

-- The source id for PubChem data is found in the SOURCE table and is ‘7’.
-- Please note that this will bring back over 4,000,000 data points
SELECT DISTINCT
  md.molregno,
  cs.canonical_smiles,
  md.chembl_id,
  act.standard_type,
  act.standard_value,
  act.standard_units
FROM activities act
  JOIN molecule_dictionary md ON act.molregno = md.molregno
  JOIN compound_structures cs ON md.molregno = cs.molregno
  JOIN compound_records cr ON cr.molregno = act.molregno
  JOIN source src ON src.src_id = cr.src_id
    AND src.src_id = '7';

Last updated