Schema Questions and SQL Examples

Can you show the relationship between all the tables in the ChEMBL database?

A PNG of the schema relationships can be found on the FTP site in the latest release directory: ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest

Where can I get more information about the new classifications of target types in ChEMBL_15?

This can be found in the following slides (Current for ChEMBL_15): Target Types

Retrieve all the bioactivity data for bacterial targets:

1
SELECT md.chembl_id AS compound_chembl_id,
2
cs.canonical_smiles,
3
act.standard_type,
4
act.standard_value,
5
act.standard_units,
6
td.chembl_id AS target_chembl_id,
7
td.organism, td.pref_name
8
FROM target_dictionary td
9
JOIN assays a ON td.tid = a.tid
10
JOIN activities act ON a.assay_id = act.assay_id
11
JOIN molecule_dictionary md ON act.molregno = md.molregno
12
JOIN compound_structures cs ON md.molregno = cs.molregno
13
JOIN organism_class oc ON td.tax_id = oc.tax_id
14
AND oc.L1 = 'Bacteria';
Copied!

Retrieve activity details for compound and all its salts which have an IC50 bioactivity value in nM against a target of interest:

1
-- Compound is Sildenafil (CHEMBL192)
2
-- Target is human PDE5 (CHEMBL1827)
3
SELECT m.chembl_id AS compound_chembl_id,
4
s.canonical_smiles,
5
r.compound_key,
6
NVL(TO_CHAR(d.pubmed_id),d.doi) AS pubmed_id_or_doi,
7
a.description AS assay_description,
8
act.standard_type,
9
act.standard_relation,
10
act.standard_value,
11
act.standard_units,
12
act.activity_comment,
13
t.chembl_id AS target_chembl_id,
14
t.pref_name AS target_name,
15
t.organism AS target_organism
16
FROM compound_structures s
17
RIGHT JOIN molecule_dictionary m ON s.molregno = m.molregno
18
JOIN compound_records r ON m.molregno = r.molregno
19
JOIN docs d ON r.doc_id = d.doc_id
20
JOIN activities act ON r.record_id = act.record_id
21
JOIN assays a ON act.assay_id = a.assay_id
22
JOIN target_dictionary t ON a.tid = t.tid
23
AND t.chembl_id = 'CHEMBL1827'
24
AND m.chembl_id IN
25
(SELECT DISTINCT
26
m1.chembl_id
27
FROM molecule_dictionary m1
28
JOIN molecule_hierarchy mh ON mh.molregno = m1.molregno
29
JOIN molecule_dictionary m2 ON mh.parent_molregno = m2.molregno
30
AND m2.chembl_id = 'CHEMBL192')
31
AND act.standard_type = 'IC50'
32
AND act.standard_units = 'nM';
Copied!

Retrieve compounds which are selective to one target over a second target:

1
-- Compounds which are selective for Human CDK2 (CHEMBL301) over Human CDK5 (CHEMBL4036)
2
-- Selectivity is based on comparing binding affinities using IC50 values.
3
SELECT md.chembl_id,
4
cs.canonical_smiles
5
FROM target_dictionary td
6
JOIN assays a ON td.tid = a.tid
7
JOIN activities act ON a.assay_id = act.assay_id
8
JOIN molecule_dictionary md ON md.molregno = act.molregno
9
JOIN compound_structures cs ON md.molregno = cs.molregno
10
AND act.standard_relation = '='
11
AND act.standard_type IN ('IC50')
12
AND act.standard_units = 'nM'
13
AND act.standard_value < 50
14
AND td.chembl_id = 'CHEMBL301'
15
INTERSECT
16
SELECT md.chembl_id,
17
cs.canonical_smiles
18
FROM target_dictionary td
19
JOIN assays a ON td.tid = a.tid
20
JOIN activities act ON a.assay_id = act.assay_id
21
JOIN molecule_dictionary md ON md.molregno = act.molregno
22
JOIN compound_structures cs ON md.molregno = cs.molregno
23
AND act.standard_relation = '='
24
AND act.standard_type IN ('IC50')
25
AND act.standard_units = 'nM'
26
AND act.standard_value > 200
27
AND td.chembl_id = 'CHEMBL4036';
Copied!

Retrieve target ChEMBL_ID, target_name, target_type, protein accessions and sequences for all protein targets:

1
SELECT t.chembl_id AS target_chembl_id,
2
t.pref_name AS target_name,
3
t.target_type,
4
c.accession AS protein_accession,
5
c.sequence AS protein_sequence
6
FROM target_dictionary t
7
JOIN target_type tt ON t.target_type = tt.target_type
8
JOIN target_components tc ON t.tid = tc.tid
9
JOIN component_sequences c ON tc.component_id = c.component_id
10
AND tt.parent_type = 'PROTEIN';
Copied!

Retrieve PK data from 'Curated Drug Pharmacokinetic Data' source in ChEMBL for drug:

1
-- Data for levofloxacin
2
SELECT DISTINCT
3
d.title,
4
min(decode(ap.standard_type, 'DATASET', nvl(to_char(ap.standard_value), ap.standard_text_value))) dataset,
5
a.assay_id,
6
a.description,
7
min(decode(actp.standard_type, 'DOSED_COMPOUND_NAME',
8
nvl(to_char(actp.standard_value), actp.standard_text_value) || ' ' ||
9
actp.standard_units)) dosed_compound_name,
10
min(decode(actp.standard_type, 'DOSE',
11
nvl(to_char(actp.standard_value), actp.standard_text_value) || ' ' || actp.standard_units)) dose,
12
min(decode(actp.standard_type, 'DOSAGE_FORM',
13
nvl(to_char(actp.standard_value), actp.standard_text_value) || ' ' || actp.standard_units)) dosage_form,
14
min(decode(actp.standard_type, 'REGIMEN',
15
nvl(to_char(actp.standard_value), actp.standard_text_value) || ' ' || actp.standard_units)) regimen,
16
min(decode(actp.standard_type, 'ROUTE', nvl(to_char(actp.standard_value), actp.standard_text_value))) route,
17
min(decode(actp.standard_type, 'GENDER', nvl(to_char(actp.standard_value), actp.standard_text_value))) gender,
18
min(decode(actp.standard_type, 'AGE_RANGE', nvl(to_char(actp.standard_value), actp.standard_text_value))) age_range,
19
min(decode(actp.standard_type, 'HEALTH_STATUS', nvl(to_char(actp.standard_value),
20
actp.standard_text_value))) health_status,
21
min(decode(actp.standard_type, 'TISSUE', nvl(to_char(actp.standard_value),
22
actp.standard_text_value))) tissue,
23
cr.molregno,
24
cr.compound_name,
25
act.activity_id,
26
act.toid,
27
act.standard_type,
28
act.standard_relation,
29
act.standard_value,
30
act.standard_units,
31
act.activity_comment
32
FROM source s
33
JOIN compound_records cr ON s.src_id = cr.src_id
34
JOIN docs d ON d.doc_id = cr.doc_id
35
JOIN activities act ON cr.record_id = act.record_id AND cr.doc_id = act.doc_id
36
JOIN activity_properties actp ON act.activity_id = actp.activity_id
37
JOIN assays a ON act.assay_id = a.assay_id
38
JOIN assay_parameters ap ON a.assay_id = ap.assay_id
39
AND s.src_description = 'Curated Drug Pharmacokinetic Data'
40
AND cr.compound_name LIKE 'LEVOFLOXACIN%'
41
GROUP BY d.title, a.assay_id, a.description, cr.molregno, cr.compound_name, act.activity_id, act.toid,
42
act.standard_type, act.standard_relation, act.standard_value, act.standard_units, act.activity_comment
43
ORDER BY cr.compound_name, act.toid, act.standard_type;
Copied!

Retrieve compound activity details for all targets containing a protein of interest:

1
-- Protein of interest is human M2 muscarinic receptor (P08172)
2
SELECT DISTINCT
3
m.chembl_id AS compound_chembl_id,
4
s.canonical_smiles,
5
r.compound_key,
6
NVL(TO_CHAR(d.pubmed_id), d.doi) AS pubmed_id_or_doi,
7
a.description AS assay_description,
8
act.standard_type,
9
act.standard_relation,
10
act.standard_value,
11
act.standard_units,
12
act.activity_comment,
13
t.chembl_id AS target_chembl_id,
14
t.pref_name AS target_name,
15
t.target_type
16
FROM compound_structures s
17
RIGHT JOIN molecule_dictionary m ON s.molregno = m.molregno
18
JOIN compound_records r ON m.molregno = r.molregno
19
JOIN docs d ON r.doc_id = d.doc_id
20
JOIN activities act ON r.record_id = act.record_id
21
JOIN assays a ON act.assay_id = a.assay_id
22
JOIN target_dictionary t ON a.tid = t.tid
23
JOIN target_components tc ON t.tid = tc.tid
24
JOIN component_sequences cs ON tc.component_id = cs.component_id
25
AND cs.accession = 'P08172';
Copied!

Retrieve compound activity details for a target:

1
-- Target is Human PDE5 (CHEMBL1827)
2
SELECT m.chembl_id AS compound_chembl_id,
3
s.canonical_smiles,
4
r.compound_key,
5
NVL(TO_CHAR(d.pubmed_id),d.doi) AS pubmed_id_or_doi,
6
a.description AS assay_description, act.standard_type,
7
act.standard_relation,
8
act.standard_value,
9
act.standard_units,
10
act.activity_comment
11
FROM compound_structures s,
12
molecule_dictionary m,
13
compound_records r,
14
docs d,
15
activities act,
16
assays a,
17
target_dictionary t
18
WHERE s.molregno (+) = m.molregno
19
AND m.molregno = r.molregno
20
AND r.record_id = act.record_id
21
AND r.doc_id = d.doc_id
22
AND act.assay_id = a.assay_id
23
AND a.tid = t.tid
24
AND t.chembl_id = 'CHEMBL1827';
Copied!

Can you show me how to use SQL to only extract the PubChem data from ChEMBL?

1
-- The source id for PubChem data is found in the SOURCE table and is ‘7’.
2
-- Please note that this will bring back over 4,000,000 data points
3
SELECT DISTINCT
4
md.molregno,
5
cs.canonical_smiles,
6
md.chembl_id,
7
act.standard_type,
8
act.standard_value,
9
act.standard_units
10
FROM activities act
11
JOIN molecule_dictionary md ON act.molregno = md.molregno
12
JOIN compound_structures cs ON md.molregno = cs.molregno
13
JOIN compound_records cr ON cr.molregno = act.molregno
14
JOIN source src ON src.src_id = cr.src_id
15
AND src.src_id = '7';
Copied!
Last modified 3yr ago