PUBMED
First Authors
efetch -db pubmed -id 6271474,5685784,4882854,6243420 -format xml |
xtract -pattern PubmedArticle -element MedlineCitation/PMID "#Author" \
-block Author -position first -sep " " -element Initials,LastName \
-block Article -element ArticleTitle
6271474 5 MJ Casadaban Tn3: transposition and control.
5685784 2 RK Mortimer Suppressors and suppressible mutations in yeast.
4882854 2 ED Garber Proteins and enzymes as taxonomic tools.
6243420 1 NR Cozzarelli DNA gyrase and the supercoiling of DNA.
Formatted Authors
# Print PMID, publication date, and an English-style author list per article.
# The xtract stage writes each author as "Initials LastName"; the COM variable
# is placed in front of every author and is set to "|" only after the first
# one has been printed, so authors come out joined by "|" (presumably with COM
# empty for the first author - confirm against the xtract documentation).
# The perl stage then rewrites the "|" separators on each line:
#   1. last tab-delimited field holds exactly one "|"  ->  "A and B"
#   2. otherwise the final "|"                         ->  ", and "
#   3. every remaining "|"                             ->  ", "
efetch -db pubmed -id 1413997,6301692,781293 -format xml |
xtract -pattern PubmedArticle -element MedlineCitation/PMID \
-block PubDate -sep "-" -element Year,Month,MedlineDate \
-block Author -sep " " -tab "" \
-element "&COM" Initials,LastName -COM "(|)" |
perl -pe 's/(\t[^\t|]*)\|([^\t|]*)$/$1 and $2/; s/\|([^|]*)$/, and $1/; s/\|/, /g'
1413997 1992-Oct RK Mortimer, CR Contopoulou, and JS King
6301692 1983-Apr MA Krasnow and NR Cozzarelli
781293 1976-Jul MJ Casadaban
Medical Subject Headings
efetch -db pubmed -id 6092233,2539356,1937004 -format xml |
xtract -pattern PubmedArticle -element MedlineCitation/PMID \
-block MeshHeading \
-subset DescriptorName -pfc "\n" -sep "|" -element @MajorTopicYN,DescriptorName \
-subset QualifierName -pfc " / " -sep "|" -element @MajorTopicYN,QualifierName |
sed -e 's/N|//g' -e 's/Y|/*/g'
6092233
Base Sequence
DNA Restriction Enzymes
DNA, Fungal / genetics / *isolation & purification
*Genes, Fungal
...
Book Authors and Editors
efetch -db pubmed -id 21433338 -format xml |
xtract -pattern PubmedBookArticle \
-path BookDocument.AuthorList.Author -element LastName \
-path BookDocument.Book.AuthorList.Author -element LastName
Fauci Desrosiers Coffin Hughes Varmus
Heterogeneous Data
efetch -db pubmed -id 21433338,17247418 -format xml |
xtract -pattern "PubmedArticleSet/*" \
-group "Book/AuthorList" -element LastName \
-group "Article/AuthorList" -element LastName
Coffin Hughes Varmus
Lederberg Cavalli Lederberg
Multiple Links
esearch -db pubmed -query "conotoxin AND dopamine [MAJR]" |
elink -target protein -cmd neighbor |
xtract -pattern LinkSet -if Link/Id -element IdList/Id Link/Id
28666811 17105332 9506485
23624852 17105332
14657161 27532980 27532978 19424304
12944511 31542395 17105332
Markup Correction
for id in 8475897 8988608 9698410 10194376 15949988 16271163 17282049 \
19793852 20968289 21892341 22106757 22785267 22360335 23095895 23095897 \
25435818 26433210 27672066 28635620 28976125 29547395 29869631 29869640
do
efetch -db pubmed -format xml -id "$id" |
xtract -pattern PubmedArticle -plg "\n\n" -sep "\n\n" -tab "\n\n" \
-element MedlineCitation/PMID ArticleTitle Abstract/AbstractText
done
Record Counts
echo "diphtheria measles pertussis polio tuberculosis" |
xargs -n 1 sh -c 'esearch -db pubmed -query "$0 [MESH]" |
efilter -days 365 -datetype PDAT |
xtract -pattern ENTREZ_DIRECT -lbl "$0" -element Count'
diphtheria 33
measles 150
pertussis 85
polio 100
tuberculosis 1608
Elink -cited Equivalent
# elink_cited: replace the PubMed records arriving on stdin with the set of
# records that cite them, using the NIH iCite service.
#   stdin  - whatever "efetch -format uid" accepts (e.g. esearch output)
#   stdout - the output of "epost -db pubmed" for the citing PMIDs
# UIDs are de-duplicated and sent to iCite in batches of 100; the cited_by
# values of every response are split into single words, de-duplicated again,
# and posted back to the pubmed database.
elink_cited() {
  efetch -format uid \
    | sort -n \
    | uniq \
    | join-into-groups-of 100 \
    | while read batch; do
        # One iCite request per batch; the JSON reply is converted to XML
        # by "xtract -j2x" before cited_by is extracted.
        nquire -get https://icite.od.nih.gov/api/pubs -pmids "$batch" \
          | xtract -j2x \
          | xtract -pattern opt -element cited_by
      done \
    | word-at-a-time \
    | sort -n \
    | uniq \
    | epost -db pubmed
}
esearch -db pubmed -query "Beadle GW [AUTH] AND Tatum EL [AUTH]" |
elink_cited |
efetch -format abstract
PMC
Formatting Tag Removal
efetch -db pmc -id 4729119 -format xml |
xtract -mixed -pattern article -group p \
-position first -tab "\n\n" -element p -plain p |
fold -w 70 -s | awk '{$1=$1};1'
The intestinal cells of Caenorhabditis elegans are
filled with heterogeneous granular organelles that are associated
with specific organ functions. The best studied of these organelles
...
The intestinal cells of Caenorhabditis elegans are filled with
heterogeneous granular organelles that are associated with specific
organ functions. The best studied of these organelles are lipid
...
SEQUENCE
Peptide Sequences
esearch -db protein -query "conotoxin AND mat_peptide [FKEY]" |
efetch -format gpc |
xtract -insd complete mat_peptide "%peptide" product mol_wt peptide |
grep -i conotoxin | sort -t $'\t' -u -k 2,2n | head -n 8
ADB43131.1 15 conotoxin Cal 1b 1708 LCCKRHHGCHPCGRT
ADB43128.1 16 conotoxin Cal 5.1 1829 DPAPCCQHPIETCCRR
AIC77105.1 17 conotoxin Lt1.4 1705 GCCSHPACDVNNPDICG
ADB43129.1 18 conotoxin Cal 5.2 2008 MIQRSQCCAVKKNCCHVG
ADD97803.1 20 conotoxin Cal 1.2 2206 AGCCPTIMYKTGACRTNRCR
AIC77085.1 21 conotoxin Bt14.8 2574 NECDNCMRSFCSMIYEKCRLK
ADB43125.1 22 conotoxin Cal 14.2 2157 GCPADCPNTCDSSNKCSPGFPG
AIC77154.1 23 conotoxin Bt14.19 2578 VREKDCPPHPVPGMHKCVCLKTC
Vitamin Biosynthesis
esearch -db pubmed -query "lycopene cyclase" |
elink -related |
elink -target protein |
efilter -organism rodents -source refseq |
efetch -format docsum |
xtract -pattern DocumentSummary -element AccessionVersion Title |
grep -i carotene
NP_001346539.1 beta,beta-carotene 9',10'-oxygenase isoform 2 [Mus musculus]
NP_573480.1 beta,beta-carotene 9',10'-oxygenase isoform 1 [Mus musculus]
NP_446100.2 beta,beta-carotene 15,15'-dioxygenase [Rattus norvegicus]
NP_001121184.1 beta,beta-carotene 9',10'-oxygenase [Rattus norvegicus]
NP_001156500.1 beta,beta-carotene 15,15'-dioxygenase isoform 2 [Mus musculus]
NP_067461.2 beta,beta-carotene 15,15'-dioxygenase isoform 1 [Mus musculus]
Coding Sequences
efetch -db nucleotide -id J01636.1 -format gbc -style withparts |
xtract -insd CDS gene sub_sequence
J01636.1 lacI GTGAAACCAGTAACGTTATACGATGTCGCAGAGTATGCCG...
J01636.1 lacZ ATGACCATGATTACGGATTCACTGGCCGTCGTTTTACAAC...
J01636.1 lacY ATGTACTATTTAAAAAACACAAACTTTTGGATGTTCGGTT...
J01636.1 lacA TTGAACATGCCAATGACCGAAAGAATAAGAGCAGGCAAGC...
Sequence Subregion
efetch -db nuccore -id U54469 -format gbc |
xtract -pattern INSDSeq -nucleic INSDSeq_sequence[2881:1] |
fold -w 60
CCGGTTTTAATGTAGGTTTTTATTAATATACTTTTCCGTCTAATCCATTATTGACAGTGA
CTACAAAAAGCGGATAGATTTTATATTATGCCGATTTTTGATAACAAAGGGGGTTCCGTT
TCGGTTTCGTTACGCGGGTCTTAGACAATAGTCACGATTAATCGCTACTGTTGCTTATAA
...
3'UTR Sequences
#!/bin/bash -norc
# ThreePrimeUTRs: print the 3' UTR of every CDS found in INSDSeq XML.
#   stdin  - INSDSeq XML (e.g. from "efetch -format gbc")
#   stdout - FASTA-like text: a ">accession 3'UTR: start..end" header followed
#            by the sequence folded at 50 columns, or ">accession NO 3'UTR"
#            when the CDS ends at (or beyond) the last base.
# xtract emits one row per CDS feature: accession, last INSDInterval_to value,
# and the full record sequence.
ThreePrimeUTRs() {
  xtract -pattern INSDSeq -ACC INSDSeq_accession-version -SEQ INSDSeq_sequence \
    -block INSDFeature -if INSDFeature_key -equals CDS \
      -pfc "\n" -element "&ACC" -rst -last INSDInterval_to -element "&SEQ" |
  while read -r acc pos seq
  do
    # Blank rows or rows without a numeric end position would make the
    # integer test below fail noisily, so skip them.
    case "$pos" in
      '' | *[!0-9]*) continue ;;
    esac
    len=${#seq}
    if [ "$pos" -lt "$len" ]
    then
      # pos is the 1-based end of the CDS, so the UTR starts at pos+1,
      # which is offset pos in the 0-based substring expansion.
      printf ">%s 3'UTR: %s..%s\n" "$acc" "$((pos+1))" "$len"
      printf '%s\n' "${seq:$pos}" | fold -w 50
    else
      printf ">%s NO 3'UTR\n" "$acc"
    fi
  done
}
esearch -db nuccore -query "5.5.1.19 [ECNO]" |
efilter -molecule mrna -source refseq |
efetch -format gbc | ThreePrimeUTRs
>NM_001328461.1 3'UTR: 1737..1871
gatgaatatagagttactgtgttgtaagctaatcatcatactgatgcaag
tgcattatcacatttacttctgctgatgattgttcataagattatgagtt
agccatttatcaaaaaaaaaaaaaaaaaaaaaaaa
>NM_001316759.1 3'UTR: 1628..1690
atccgagtaattcggaatcttgtccaattttatatagcctatattaatac
...
Amino Acid Composition
#!/bin/bash -norc
# Three-letter residue names indexed by one-letter code offset
# (a=0 -> Ala, b=1 -> Asx, ... z=25 -> Glx).
abbrev=( Ala Asx Cys Asp Glu Phe Gly His Ile
         Xle Lys Leu Met Asn Pyl Pro Gln Arg
         Ser Thr Sec Val Trp Xxx Tyr Glx )

# AminoAcidComp: turn a per-letter count table into a 26-row composition table.
#   $1     - newline-separated rows of "<count> <lowercase letter>"
#   stdout - one "<Abbrev><TAB><count>" row per entry of abbrev (count 0 when
#            the letter is absent), sorted by abbreviation
# Rows that are blank or whose second field is not a single letter a-z are
# ignored instead of producing an out-of-range array subscript.
AminoAcidComp() {
  local -a count=()
  local num lttr idx ofs i
  while read -r num lttr
  do
    [ "${#lttr}" -eq 1 ] || continue
    # A leading quote makes printf yield the character's numeric code.
    printf -v idx '%d' "'$lttr"
    ofs=$((idx-97))
    if [ "$ofs" -lt 0 ] || [ "$ofs" -gt 25 ]
    then
      continue
    fi
    count[$ofs]="$num"
  done <<< "$1"
  for i in {0..25}
  do
    printf '%s\t%s\n' "${abbrev[$i]}" "${count[$i]-0}"
  done |
  sort
}
# AminoAcidJoin: build a side-by-side amino acid composition table.
#   stdin  - rows of "<accession> <sequence> <gene>"
#   stdout - tab-delimited table: a "GENE" header row naming each gene, then
#            one row per residue abbreviation with one count column per gene;
#            rows whose counts contain no non-zero digit are dropped
# Each sequence is lower-cased, stripped to letters, split into single
# characters and counted, then AminoAcidComp converts the counts to a column
# that is joined onto the table accumulated so far.
AminoAcidJoin() {
  # Keep the working variables local so the caller's globals are untouched.
  local result="" acc seq gene comp current
  while read -r acc seq gene
  do
    comp="$(printf '%s\n' "$seq" | tr A-Z a-z | sed 's/[^a-z]//g' | fold -w 1 | sort-uniq-count)"
    current=$(AminoAcidComp "$comp")
    current=$(printf 'GENE\t%s\n%s' "$gene" "$current")
    if [ -n "$result" ]
    then
      result=$(join -t $'\t' <(printf '%s\n' "$result") <(printf '%s\n' "$current"))
    else
      result=$current
    fi
  done
  printf '%s\n' "$result" |
  grep -e "GENE" -e "[1-9]"
}
ids="NP_001172026,NP_000509,NP_004001,NP_001243779"
efetch -db protein -id "$ids" -format gpc |
xtract -insd INSDSeq_sequence CDS gene |
AminoAcidJoin
GENE INS HBB DMD TTN
Ala 10 15 210 2084
Arg 5 3 193 1640
Asn 3 6 153 1111
Asp 2 7 185 1720
Cys 6 2 35 513
Gln 7 3 301 942
Glu 8 8 379 3193
Gly 12 13 104 2066
His 2 9 84 478
Ile 2 0 165 2062
Leu 20 18 438 2117
Lys 2 11 282 2943
Met 2 2 79 398
Phe 3 8 77 908
Pro 6 7 130 2517
Ser 5 5 239 2463
Thr 3 7 194 2546
Trp 2 2 67 466
Tyr 4 3 61 999
Val 6 18 186 3184
GENE
Chromosome Assignments
esearch -db gene -query "calmodulin [PFN] AND mammalia [ORGN]" |
efetch -format docsum |
xtract -pattern DocumentSummary \
-def "-" -element Id Name MapLocation ScientificName
801 CALM1 14q32.11 Homo sapiens
808 CALM3 19q13.32 Homo sapiens
805 CALM2 2p21 Homo sapiens
24242 Calm1 6q32 Rattus norvegicus
12313 Calm1 12 E Mus musculus
326597 CALM - Bos taurus
50663 Calm2 6q12 Rattus norvegicus
24244 Calm3 1q21 Rattus norvegicus
12315 Calm3 7 9.15 cM Mus musculus
12314 Calm2 17 E4 Mus musculus
617095 CALM1 - Bos taurus
396838 CALM3 6 Sus scrofa
...
Genome Range
esearch -db gene -query "Homo sapiens [ORGN] AND Y [CHR]" |
efilter -status alive | efetch -format docsum |
xtract -pattern DocumentSummary -NAME Name -DESC Description \
-block GenomicInfoType -if ChrLoc -equals Y \
-min ChrStart,ChrStop -element "&NAME" "&DESC" |
sort -k 1,1n | cut -f 2- |
grep -v pseudogene | grep -v uncharacterized |
between-two-genes ASMT IL3RA
IL3RA interleukin 3 receptor subunit alpha
SLC25A6 solute carrier family 25 member 6
LINC00106 long intergenic non-protein coding RNA 106
ASMTL-AS1 ASMTL antisense RNA 1
ASMTL acetylserotonin O-methyltransferase-like
P2RY8 purinergic receptor P2Y8
AKAP17A A-kinase anchoring protein 17A
ASMT acetylserotonin O-methyltransferase
Centromere Position
# Download the human ideogram table and keep columns 1, 2, 6 and 7 of the
# "acen" rows that belong to chromosome X.
# The final pattern uses $'...' so that grep receives a real tab character:
# inside ordinary double quotes "\t" reaches grep as backslash-t, which is not
# a tab in a basic regular expression and would also let "X" match only by
# accident.
nquire -ftp ftp.ncbi.nlm.nih.gov pub/gdp ideogram_9606_GCF_000001305.14_850_V1 |
grep acen | cut -f 1,2,6,7 | grep $'^X\t'
X p 58100001 61000000
X q 61000001 63800000
Gene Regions
esearch -db gene -query "DDT [GENE] AND mouse [ORGN]" |
efetch -format docsum |
xtract -pattern GenomicInfoType -element ChrAccVer ChrStart ChrStop |
xargs -n 3 sh -c 'efetch -db nuccore -format gb \
-id "$0" -chr_start "$1" -chr_stop "$2"'
LOCUS NC_000076 2142 bp DNA linear CON 09-FEB-2015
DEFINITION Mus musculus strain C57BL/6J chromosome 10, GRCm38.p3 C57BL/6J.
ACCESSION NC_000076 REGION: complement(75771233..75773374) GPC_000000783
VERSION NC_000076.6
...
FEATURES Location/Qualifiers
source 1..2142
/organism="Mus musculus"
/mol_type="genomic DNA"
/strain="C57BL/6J"
/db_xref="taxon:10090"
/chromosome="10"
gene 1..2142
/gene="Ddt"
mRNA join(1..159,462..637,1869..2142)
/gene="Ddt"
/product="D-dopachrome tautomerase"
/transcript_id="NM_010027.1"
CDS join(52..159,462..637,1869..1941)
/gene="Ddt"
/codon_start=1
/product="D-dopachrome decarboxylase"
/protein_id="NP_034157.1"
/translation="MPFVELETNLPASRIPAGLENRLCAATATILDKPEDRVSVTIRP
GMTLLMNKSTEPCAHLLVSSIGVVGTAEQNRTHSASFFKFLTEELSLDQDRIVIRFFP
...
Recursive Data
esearch -db gene -query "rbcL [GENE] AND maize [ORGN]" |
efetch -format xml |
xtract -pattern Entrezgene -block "**/Gene-commentary" \
-if Gene-commentary_type@value -equals genomic \
-tab "\n" -element Gene-commentary_accession |
sort | uniq
NC_001666
X86563
Z11973
Genes in Pathways
esearch -db gene -query "PAH [GENE]" -organism human |
elink -target biosystems |
efilter -pathway wikipathways |
elink -target gene |
efetch -format docsum |
xtract -pattern DocumentSummary -element Name Id Description |
grep -v pseudogene | grep -v uncharacterized |
sort -f
AANAT 15 aralkylamine N-acetyltransferase
ACADM 34 acyl-CoA dehydrogenase medium chain
ACHE 43 acetylcholinesterase (Cartwright blood group)
ADCYAP1 116 adenylate cyclase activating polypeptide 1
...
Gene Products
for sym in HBB DMD TTN ATP7B HFE BRCA2 CFTR PAH PRNP RAG1
do
esearch -db gene -query "$sym [GENE] AND human [ORGN]" |
efilter -query "alive [PROP]" | efetch -format docsum |
xtract -pattern GenomicInfoType \
-element ChrAccVer ChrStart ChrStop |
while read acc str stp
do
efetch -db nuccore -format gbc \
-id "$acc" -chr_start "$str" -chr_stop "$stp" |
xtract -insd CDS,mRNA INSDFeature_key "#INSDInterval" \
gene "%transcription" "%translation" \
product transcription translation |
grep -i $'\t'"$sym"$'\t'
done
done
NC_000011.10 mRNA 3 HBB 626 hemoglobin, beta ACATTTGCTT...
NC_000011.10 CDS 3 HBB 147 hemoglobin subunit beta MVHLTPEEKS...
NC_000023.11 mRNA 78 DMD 13805 dystrophin, transcript variant X2 AGGAAGATGA...
NC_000023.11 mRNA 77 DMD 13794 dystrophin, transcript variant X6 ACTTTCCCCC...
NC_000023.11 mRNA 77 DMD 13800 dystrophin, transcript variant X5 ACTTTCCCCC...
NC_000023.11 mRNA 77 DMD 13785 dystrophin, transcript variant X7 ACTTTCCCCC...
NC_000023.11 mRNA 74 DMD 13593 dystrophin, transcript variant X8 ACTTTCCCCC...
NC_000023.11 mRNA 75 DMD 13625 dystrophin, transcript variant X9 ACTTTCCCCC...
...
Unfiltered Gene Lookup
for sym in ATP6 CBD HBB OPN1MW
do
esearch -db gene -query "$sym [GENE]" -organism human |
efetch -format docsum |
xtract -pattern DocumentSummary -def "-" -lbl "${sym}" \
-element NomenclatureSymbol Id Description CommonName
done
ATP6 MT-ATP6 4508 ATP synthase F0 subunit 6 human
ATP6 - 6775074 ATP synthase F0 subunit 6 Neandertal
ATP6 - 8923188 ATP synthase F0 subunit 6 Denisova hominin
CBD OPN1MW 2652 opsin 1, medium wave sensitive human
HBB HBB 3043 hemoglobin subunit beta human
HBB KRT89P 85344 keratin 89 pseudogene human
OPN1MW OPN1MW 2652 opsin 1, medium wave sensitive human
OPN1MW OPN1MW3 101060233 opsin 1, medium wave sensitive 3 human
Protein Coding Genes
for sym in MT-ATP6 BRCA2 CFTR HBB HFE IL9R OPN1MW PAH
do
esearch -db gene -query "$sym [GENE]" -organism human |
efilter -status alive -type coding |
efetch -format docsum |
xtract -pattern DocumentSummary \
-if NomenclatureSymbol -equals "${sym}" \
-lbl "${sym}" -element Id Chromosome Description
done |
...
MT-ATP6 4508 MT ATP synthase F0 subunit 6
BRCA2 675 13 BRCA2 DNA repair associated
CFTR 1080 7 CF transmembrane conductance regulator
HBB 3043 11 hemoglobin subunit beta
HFE 3077 6 homeostatic iron regulator
IL9R 3581 X, Y interleukin 9 receptor
OPN1MW 2652 X opsin 1, medium wave sensitive
PAH 5053 12 phenylalanine hydroxylase
Common Pathways
...
while IFS=$'\t' read sym uid chr desc
do
elink -db gene -id "$uid" -target biosystems |
efilter -kind pathway |
efetch -format docsum |
xtract -pattern DocumentSummary -lbl "${sym}" \
-lower source -element externalid biosystemname
done |
sort -t $'\t' -k 2,2 -k 3,3 -k 1,1 |
awk 'a[$3]++{ if(a[$3]==2){ print b }; print $0}; {b=$0}'
MT-ATP6 kegg hsa01100 Metabolic pathways
PAH kegg hsa01100 Metabolic pathways
HBB reactome R-HSA-1430728 Metabolism
MT-ATP6 reactome R-HSA-1430728 Metabolism
PAH reactome R-HSA-1430728 Metabolism
CFTR reactome R-HSA-162582 Signal Transduction
OPN1MW reactome R-HSA-162582 Signal Transduction
...
TAXONOMY
Taxonomic Names
esearch -db taxonomy -query "txid10090 [SBTR] OR camel [COMN]" |
efetch -format docsum |
xtract -pattern DocumentSummary -if CommonName \
-element Id ScientificName CommonName
57486 Mus musculus molossinus Japanese wild mouse
39442 Mus musculus musculus eastern European house mouse
35531 Mus musculus bactrianus southwestern Asian house mouse
10092 Mus musculus domesticus western European house mouse
10091 Mus musculus castaneus southeastern Asian house mouse
10090 Mus musculus house mouse
9838 Camelus dromedarius Arabian camel
9837 Camelus bactrianus Bactrian camel
STRUCTURE
Structural Similarity
esearch -db structure -query "crotalus [ORGN] AND phospholipase A2" |
elink -related |
efilter -query "archaea [ORGN]" |
efetch -format docsum |
xtract -pattern DocumentSummary \
-if PdbClass -equals Hydrolase \
-element PdbAcc PdbDescr
3WIV Crystal Structure Of Pro-s324a/d356a
3WIU Crystal Structure Of Pro-s324a/l349a
3VV2 Crystal Structure Of Complex Form Between S324a-subtilisin And Mutant Tkpro
3VHQ Crystal Structure Of The Ca6 Site Mutant Of Pro-Sa-Subtilisin
2ZWP Crystal Structure Of Ca3 Site Mutant Of Pro-S324a
...
SNP
Amino Acid Substitutions
# Apply each missense variant linked to human OPN1MW to its protein sequence
# and print the altered sequence in FASTA form.
# The SNP records are fetched as JSON and converted to XML (xtract -j2x) with
# one RS record per SNP. The second xtract emits, for every variant inside a
# protein group whose name contains "missense", a row of:
#   rsid  protein-accession  inserted-residue  position
# (-inc presumably turns a 0-based position into a 1-based one - confirm).
# Rows are sorted by accession, then position, so that each protein sequence
# only has to be downloaded once, and exact duplicates are removed.
esearch -db gene -query "OPN1MW [GENE] AND human [ORGN]" |
elink -target snp | efetch -format json |
xtract -j2x -set - -rec RS |
xtract -pattern RS -pfx "rs" -RSID RS/refsnp_id \
-group protein -if name -contains missense \
-block variant -element "&RSID" seq_id \
inserted_sequence -tab "\n" -inc position |
sort -t $'\t' -k 2,2 -k 4,4n -k 3,3f -k 1.3n | uniq |
while read rsid accn res pos
do
# Fetch the sequence only when the accession differs from the previous row.
if [ "$accn" != "$last" ]
then
# stdin is redirected from /dev/null, presumably so efetch cannot consume
# the remaining variant rows that feed this loop.
seq=$(efetch -db protein -id "$accn" -format gpc < /dev/null |
xtract -pattern INSDSeq -element INSDSeq_sequence)
last=$accn
fi
echo ">$rsid [$accn $res@$pos]"
# pos is used as 1-based: keep the first pos-1 residues, insert the variant
# residue, then append everything after the replaced residue.
echo "${seq:0:$pos-1}$res${seq:$pos}" | fold -w 50
done
>rs1238141906 [NP_000504.1 K@41]
maqqwslqrlagrhpqdsyedstqssiftytnsnstrgpfKgpnyhiapr
wvyhltsvwmifvviasvftnglvlaatmkfkklrhplnwilvnlavadl
aetviastisvvnqvygyfvlghpmcvlegytvslcgitglwslaiiswe
...
Sequences Flanking SNPs
#!/bin/bash -norc
# Print 50 bases of flanking sequence on each side of SNP rs268 for every
# placement on an "NC_" accession whose inserted allele differs from the
# deleted one.
# The xtract stage emits rows of:
#   rsid  accession  deleted-sequence  inserted-sequence  position
# (-inc presumably turns a 0-based position into a 1-based one - confirm).
efetch -db snp -id 268 -format json |
xtract -j2x -set - -rec RS |
xtract -pattern RS -pfx "rs" -RSID RS/refsnp_id \
-group placements_with_allele \
-block allele -if seq_id -starts-with "NC_" \
-and inserted_sequence -differs-from deleted_sequence \
-element "&RSID" seq_id deleted_sequence \
inserted_sequence -tab "\n" -inc position |
sort -t $'\t' -k 2,2 -k 5,5n -k 4,4f -k 1.3n | uniq |
while read rsid accn del ins pos
do
# Left flank: the 50 bases ending just before pos. stdin comes from
# /dev/null, presumably so efetch cannot consume the loop's input rows.
lft=$(efetch -db nuccore -format fasta -id "$accn" \
-seq_start "$((pos-50))" -seq_stop "$((pos-1))" < /dev/null |
grep -v '>' | tr -d '\n')
# Lengths of the inserted and deleted alleles.
ad=${#ins}
sb=${#del}
# Right flank: 50 bases starting at pos+ad-sb+1 (pos+1 for a simple
# single-base substitution).
# NOTE(review): for alleles of unequal length, confirm this offset is the
# intended start of the downstream reference sequence.
rgt=$(efetch -db nuccore -format fasta -id "$accn" \
-seq_start "$((pos+ad-sb+1))" -seq_stop "$((pos+ad-sb+50))" < /dev/null |
grep -v '>' | tr -d '\n')
echo "$rsid $accn $pos $del->$ins"
echo "5': $lft"
echo "3': $rgt"
echo ""
done
rs268 NC_000008.10 19813529 A->G
5': CTGCTTGAGTTGTAGAAAGAACCGCTGCAACAATCTGGGCTATGAGATCA
3': TAAAGTCAGAGCCAAAAGAAGCAGCAAAATGTACCTGAAGACTCGTTCTC
rs268 NC_000008.11 19956018 A->G
5': CTGCTTGAGTTGTAGAAAGAACCGCTGCAACAATCTGGGCTATGAGATCA
3': TAAAGTCAGAGCCAAAAGAAGCAGCAAAATGTACCTGAAGACTCGTTCTC
EXTERNAL
JSON Nested Array Expansion
for ns in flat recurse plural depth
do
echo " $ns"
echo
nquire -get "http://mygene.info/v3" gene 2652 |
xtract -j2x -set - -rec GeneRec -nest "$ns" |
grep position | head -n 4
echo
done
"position": [
[
154182595,
154182789
],
[
154187769,
154188066
],
flat
154182595
154182789
154187769
154188066
recurse
154182595
154182789
plural
154182595
154182789
depth
154182595
154182789
Exon Interval Sets
nquire -get "http://mygene.info/v3/gene/2652" |
xtract -j2x -set - -rec GeneRec -nest plural |
xtract -pattern GeneRec -group exons -lbl "" -clr \
-block positions -pfc "\n" -sep ".." -tab "\n" -element position
154182595..154182789
154187769..154188066
154190053..154190222
154191687..154191853
154193407..154193647
154195929..154196861
154219733..154219927
154224907..154225204
...
Heterogeneous Object Names
nquire -get "http://mygene.info/v3/gene/2652" |
xtract -j2x -set - -rec GeneRec |
xtract -pattern GeneRec -group "pathway/*" -pfx "\n" -element "?,name,id"
R-HSA-162582
Signal Transduction
...
WP455
GPCRs, Class A Rhodopsin-like
reactome Signal Transduction R-HSA-162582
reactome Disease R-HSA-1643685
reactome The retinoid cycle in cones (daylight vision) R-HSA-2187335
reactome Visual phototransduction R-HSA-2187338
reactome Retinoid cycle disease events R-HSA-2453864
reactome Diseases associated with visual transduction R-HSA-2474795
reactome Signaling by GPCR R-HSA-372790
reactome Class A/1 (Rhodopsin-like receptors) R-HSA-373076
reactome GPCR downstream signalling R-HSA-388396
reactome G alpha (i) signalling events R-HSA-418594
reactome Opsins R-HSA-419771
reactome GPCR ligand binding R-HSA-500792
reactome Diseases of signal transduction R-HSA-5663202
wikipathways GPCRs, Class A Rhodopsin-like WP455
XML Namespace Prefixes
nquire -url "http://webservice.wikipathways.org" getPathway -pwId WP455 |
xtract -pattern "ns1:getPathwayResponse" -element ":gpml" |
transmute -decode64 |
xtract -pattern Pathway -block Xref \
-if @Database -equals "Entrez Gene" \
-tab "\n" -element @ID |
sort -n
134
135
136
140
146
...
LOCAL ARCHIVE
Entrez Indexing
efetch -db pubmed -id 12857958,2981625 -format xml |
xtract -e2index |
xtract -pattern IdxDocument -UID IdxUid \
-block NORM -pfc "\n" -element "&UID",NORM,"@pos"
12857958 allow 205
12857958 assays 147
12857958 binding 146
12857958 braid 187,215
12857958 braiding 153
...
Author Frequency
esearch -db pubmed -query "rattlesnake phospholipase" |
efetch -format uid | fetch-pubmed |
xtract -pattern PubmedArticle -block Author \
-sep " " -tab "\n" -element LastName,Initials |
sort-uniq-count-rank
40 Marangoni S
33 Toyama MH
28 Soares AM
25 Bon C
...
Author Counts
esearch -db pubmed -query "conotoxin" |
efetch -format uid | fetch-pubmed |
xtract -pattern PubmedArticle -num Author |
sort-uniq-count -n |
reorder-columns 2 1 |
head -n 15 |
tee /dev/tty |
xy-plot auth.png
0 11
1 193
2 854
3 844
4 699
5 588
6 439
7 291
8 187
9 124
10 122
11 58
12 33
13 18
900 +
| ********
800 + * **
| * *
700 + * ***
| * **
600 + * *
| * ***
500 + * **
| * ***
400 + * **
| * *
300 + * ***
| * *
200 + * ******
| * *********
100 + ** *
| * **********
0 + * ******
+---------+---------+---------+---------+---------+---------+---------+
0 2 4 6 8 10 12 14
Title and Abstract Word Counts
esearch -db pubmed -query "conotoxin" -pub structured |
efetch -format uid | fetch-pubmed |
xtract -stops -wrp "Set,Rec" \
-pattern PubmedArticle -wrp "PMID" -element MedlineCitation/PMID \
-wrp "Titl" -words ArticleTitle \
-block Abstract/AbstractText -wrp "Grp,Abst" -words AbstractText |
xtract -pattern Rec -element PMID -num Titl -block Grp -tab ", " -num Abst
29194563 21 63, 84, 89, 26
28882644 23 87, 34, 115, 25
28877214 10 12, 42, 315, 94
28825343 15 169
28482835 9 75, 123, 42, 37
28479398 15 170, 130
...
Verbosity Per Year
esearch -db pubmed -query "PNAS [JOUR]" -pub abstract |
efetch -format uid | stream-pubmed | gunzip -c |
xtract -stops -wrp Set,Rec -pattern PubmedArticle \
-wrp "Year" -year "PubDate/*" \
-wrp "Abst" -words Abstract/AbstractText |
xtract -wrp Set,Pub -pattern Rec \
-wrp "Year" -element Year \
-wrp "Num" -num Abst > countsByYear.xml
for yr in {1960..2020}
do
cat countsByYear.xml |
xtract -wrp Raw -pattern Pub -select Year -eq "$yr" |
xtract -pattern Raw -lbl "$yr" -avg Num
done |
tee /dev/tty |
xy-plot verbosity.png
rm countsByYear.xml
Appending Metadata
esearch -db pubmed -query "PNAS [JOUR]" -pub abstract |
efetch -format uid | fetch-pubmed > pnas.xml
cat pnas.xml |
xtract -stops -wrp Set,Rec -pattern PubmedArticle \
-wrp ID -element MedlineCitation/PMID \
-wrp Abst -words Abstract/AbstractText |
31822623 foxp3cd4regulatory...
...
xtract -pattern Rec -element ID -wrp Num -num Abst > counts.txt
31822623 243
31822622 132
31822621 252
31822620 238
...
xtract -input pnas.xml -wrp PubmedArticleSet -pattern PubmedArticle \
-select MedlineCitation/PMID -appending counts.txt > merged.xml
LOCAL INDEX
Histogram Shortcut
cat $EDIRECT_PUBMED_MASTER/Current/*.xml |
xtract -timer -pattern PubmedArticle -histogram PubDate/Month
26 8
37 9
121475 01
114579 02
111137 03
109794 04
120169 05
130062 06
125107 07
126246 08
123191 09
120957 10
109657 11
110854 12
1958892 Apr
1809730 Aug
2086169 Dec
1844717 Feb
1851803 Jan
1784258 Jul
2015942 Jun
1943325 Mar
1815691 May
1889194 Nov
2035632 Oct
1 October
1956569 Sep
Month Format Per Year
# Tabulate, per publication year, how PubDate/Month is written, classified by
# the length of the Month text. The sed stage maps length 3 -> 1, 2 -> 2,
# 1 -> 3 and any other single-digit length -> 4.
# Step 1 builds rawMonthCounts.txt with rows of: year, format code, count.
# The year/length pair is wrapped in an explicit <DT> element because the
# following -histogram DT needs an element of that name to count.
cat "$EDIRECT_PUBMED_MASTER"/Current/*.xml |
xtract -wrp Set,Rec -pattern PubmedArticle \
  -if PubDate/Month -wrp YR -year "PubDate/*" -wrp MN -len PubDate/Month |
xtract -wrp Set,Rec -pattern Rec \
  -pfx "<DT>" -sep "+-" -sfx "-</DT>" -element YR,MN |
xtract -pattern Rec -histogram DT |
reorder-columns 2 1 | tr '+' '\t' |
sed -e 's/-3-/1/g' -e 's/-2-/2/g' -e 's/-1-/3/g' -e 's/-[0-9]-/4/g' |
sort -k 1,1n -k 2,2n > rawMonthCounts.txt
# Step 2 pivots the rows into one line per year, adding one count column for
# each format code 1..4 in turn.
result=$( cut -f 1 rawMonthCounts.txt | uniq )
for i in {1..4}
do
  # $'\t' supplies real tab characters; "\t" inside double quotes would reach
  # grep as backslash-t and never match the tab-delimited code column.
  current=$( grep $'\t'"$i"$'\t' rawMonthCounts.txt | cut -f 1,3 )
  result=$(join -a 1 -t $'\t' <(echo "$result") <(echo "$current"))
done
echo "$result" > plotme.txt
cat plotme.txt | xy-plot
Phrase Query Automation
# ascend_mesh_tree: report the phrase-search count for a MeSH tree number and
# for each of its ancestors.
#   $1     - dotted tree number, with or without a trailing "*"
#   stdout - one "phrase-search -count" result per level, deepest first
# Each pass queries "<node>* [TREE]" and then drops the last dotted segment;
# the loop ends after the top-level node (no dot left) has been queried.
ascend_mesh_tree() {
  # local, so a caller's own $var is not overwritten
  local var="${1%\*}"
  while :
  do
    phrase-search -count "$var* [TREE]"
    case "$var" in
      # Strip through the last "." whatever the segment length; removing a
      # fixed four characters never shortens a string of fewer than four
      # characters, which would loop forever.
      *.* ) var="${var%.*}" ;;
      * ) break ;;
    esac
  done
}
ascend_mesh_tree "C01.925.782.417.415"
5148 c01 925 782 417 415*
26792 c01 925 782 417*
607883 c01 925 782*
870516 c01 925*
2541697 c01*
Medical Subject Heading Code Viewers
https://meshb.nlm.nih.gov/treeView
https://meshb-prev.nlm.nih.gov/treeView
MISCELLANEOUS
Indexed Fields
einfo -db pubmed |
xtract -pattern Field \
-if IsDate -equals Y -and IsHidden -equals N \
-pfx "[" -sep "]\t" -element Name,FullName |
sort -t $'\t' -k 2f
[CDAT] Date - Completion
[CRDT] Date - Create
[EDAT] Date - Entrez
[MHDA] Date - MeSH
[MDAT] Date - Modification
[PDAT] Date - Publication
Pseudocode Prototype
for each PubmedArticle {
for each Author {
print Initials LastName
}
for each MeshHeading {
print DescriptorName
for each QualifierName {
print QualifierName
}
}
}
xtract -pattern PubmedArticle \
-block Author -element Initials LastName \
-block MeshHeading -element DescriptorName \
-subset QualifierName -element QualifierName
Processing in Groups
...
efetch -format acc |
join-into-groups-of 200 |
xargs -n 1 sh -c 'epost -db nuccore -format acc -id "$0" |
efetch -format gb'