PS7: How Exoplanet Detection Works, and What Data We Actually Need

A plain explanation for the team. Companion to ps7-exoplanet-light-curves, ps7-data-strategy and decision-ps3-vs-ps7. Built around the same instinct from the Mallorn challenge: different phenomena leave different patterns in the flux.

1. What we are actually building

A pipeline that takes light curves (one star's brightness measured every two minutes) and, for each star, answers three questions:

  1. Is there a periodic dip at all?
  2. What kind of dip is it: a planet, two stars eclipsing, a blend, or a detector artefact?
  3. If it is a planet, what are its depth, period and duration, and how confident are we?

The hard, prize-winning part is question 2: telling a real planet apart from things that only look like one.

2. Anatomy of a single transit

When a planet crosses its star, the brightness drops by a tiny amount, then recovers. Three numbers describe it. Depth tells you the planet size. Duration tells you how long the crossing took. Period tells you how often it repeats. Drag the controls and toggle the eclipsing binary to see why shape matters.

<div style="font-family:system-ui;color:var(--color-text);width:100%">
  <canvas id=cv style="width:100%;height:220px;display:block"></canvas>
  <div style="display:flex;flex-wrap:wrap;gap:16px;margin-top:8px;font-size:13px">
    <label>Planet size (depth) <input id=depth type=range min=5 max=60 value=25></label>
    <label>Transit duration <input id=dur type=range min=10 max=45 value=22></label>
    <label><input id=v type=checkbox> Eclipsing binary (V-shape)</label>
  </div>
  <div id=cap style="margin-top:6px;color:var(--color-text-secondary);font-size:12px"></div>
</div>
<script>
// Canvas, its 2D context, and the three controls of the single-transit demo.
const cv = document.getElementById('cv');
const ctx = cv.getContext('2d');
const D = document.getElementById('depth');
const U = document.getElementById('dur');
const V = document.getElementById('v');
const cap = document.getElementById('cap');

// Restore the previous control positions. `store` is not defined in this
// script; it is assumed to be supplied by the host page.
D.value = store.get('depth', 25);
U.value = store.get('dur', 22);
V.checked = store.get('v', false);

// Theme colours are read from CSS custom properties on <body>.
const cs = getComputedStyle(document.body);

/**
 * Read a CSS custom property, using `fallback` when it is unset or blank.
 * The value is trimmed first: a padded or whitespace-only value is truthy
 * and would otherwise defeat the `||` fallback.
 */
const themeColor = (name, fallback) => cs.getPropertyValue(name).trim() || fallback;

const acc = themeColor('--color-accent', '#5af');
const txt = themeColor('--color-text', '#ddd');
const bd = themeColor('--color-border', '#555');
const sec = themeColor('--color-text-secondary', '#999');
/**
 * Model brightness at offset `x` from mid-transit (out-of-transit level is 1).
 *
 * @param {number} x - Offset from the transit centre.
 * @param {number} depth - Fractional brightness drop at the centre.
 * @param {number} dur - Half-width of the dip, in the same units as `x`.
 * @param {boolean} vsh - true for a linear V-shaped dip, false for a
 *   flat-bottomed U (eighth-power profile).
 * @returns {number} Relative flux; exactly 1 at or beyond the dip edge.
 */
function flux(x, depth, dur, vsh) {
  const r = Math.abs(x) / dur;
  if (r >= 1) {
    return 1;
  }
  let profile;
  if (vsh) {
    profile = 1 - r;
  } else {
    profile = 1 - Math.pow(r, 8);
  }
  return 1 - depth * profile;
}
/**
 * Repaint the single-transit diagram from the current control values:
 * axes, the dashed baseline at flux 1, the transit curve, and the
 * "depth" / "duration" annotations. Also updates the caption and writes
 * the control values to `store`.
 */
function draw() {
  const scale = window.devicePixelRatio || 1;

  // Backing-store size follows the CSS box multiplied by the pixel ratio.
  const W = (cv.width = Math.max(1, cv.clientWidth) * scale);
  const H = (cv.height = 220 * scale);
  const pad = 36 * scale;
  ctx.clearRect(0, 0, W, H);

  // Slider integers are scaled down to a fractional depth and half-duration.
  const depth = D.value / 1000;
  const dur = U.value / 100;
  const vsh = V.checked;

  // Vertical range keeps a small margin above 1 and below the dip floor.
  const fmax = 1.002;
  const fmin = 1 - depth - 0.01;
  const toX = (x) => pad + (W - 2 * pad) * (x + 1) / 2;
  const toY = (f) => pad / 2 + (H - pad - pad / 2) * (1 - (f - fmin) / (fmax - fmin));

  // Axes.
  ctx.strokeStyle = bd;
  ctx.globalAlpha = 0.6;
  ctx.lineWidth = 1 * scale;
  ctx.beginPath();
  ctx.moveTo(pad, pad / 2);
  ctx.lineTo(pad, H - pad);
  ctx.lineTo(W - pad, H - pad);
  ctx.stroke();
  ctx.globalAlpha = 1;

  // Dashed out-of-transit baseline.
  ctx.strokeStyle = sec;
  ctx.setLineDash([4 * scale, 4 * scale]);
  ctx.globalAlpha = 0.5;
  ctx.beginPath();
  ctx.moveTo(pad, toY(1));
  ctx.lineTo(W - pad, toY(1));
  ctx.stroke();
  ctx.setLineDash([]);
  ctx.globalAlpha = 1;

  // Transit curve sampled at 301 points across x in [-1, 1].
  ctx.strokeStyle = acc;
  ctx.lineWidth = 2 * scale;
  ctx.beginPath();
  for (let i = 0; i <= 300; i++) {
    const x = -1 + 2 * i / 300;
    const px = toX(x);
    const py = toY(flux(x, depth, dur, vsh));
    if (i === 0) {
      ctx.moveTo(px, py);
    } else {
      ctx.lineTo(px, py);
    }
  }
  ctx.stroke();

  // Vertical "depth" marker at mid-transit.
  const cx = toX(0);
  const topY = toY(1);
  const botY = toY(1 - depth);
  ctx.strokeStyle = txt;
  ctx.globalAlpha = 0.85;
  ctx.lineWidth = 1 * scale;
  ctx.beginPath();
  ctx.moveTo(cx, topY);
  ctx.lineTo(cx, botY);
  ctx.stroke();
  ctx.fillStyle = txt;
  ctx.font = `${12 * scale}px system-ui`;
  ctx.fillText('depth', cx + 6 * scale, (topY + botY) / 2);

  // Horizontal "duration" marker at half depth.
  const xl = toX(-dur);
  const xr = toX(dur);
  const yb = toY(1 - depth * 0.5);
  ctx.beginPath();
  ctx.moveTo(xl, yb);
  ctx.lineTo(xr, yb);
  ctx.stroke();
  ctx.globalAlpha = 1;
  ctx.fillText('duration', (xl + xr) / 2 - 20 * scale, yb - 6 * scale);

  // Axis labels.
  ctx.fillStyle = sec;
  ctx.fillText('brightness', 4 * scale, pad / 2 - 4 * scale);
  ctx.fillText('time', W - pad - 16 * scale, H - pad + 18 * scale);

  if (vsh) {
    cap.textContent = 'A V-shape with no flat bottom usually means two stars eclipsing, a common false positive, not a planet.';
  } else {
    cap.textContent = 'A U-shape with a flat bottom is the planet signature. A deeper dip means a bigger planet.';
  }

  // Persist the control state on every repaint.
  store.set('depth', D.value);
  store.set('dur', U.value);
  store.set('v', V.checked);
}
// Repaint whenever a control changes or the canvas is resized, then paint once now.
for (const control of [D, U, V]) {
  control.addEventListener('input', draw);
}
const resizeWatcher = new ResizeObserver(draw);
resizeWatcher.observe(cv);
draw();
</script>

3. The problem splits into two layers

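In Mallorn you had one job: given a transient, classify it by its pattern. Here it splits into two stages: first find which stars show a periodic dip at all (Layer 1, section 4), then work out what is causing each dip (Layer 2, section 5).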

4. Layer 1, finding the needle in 20,000 light curves

You never open files by hand. You use two tricks.

Phase folding

A single transit is faint and easy to miss. But it repeats. If you fold the time series at the correct period, every transit stacks on top of the others and a weak dip becomes obvious. At the wrong period the dips land at random phases and wash out. Move the slider until it snaps together.

<div style="font-family:system-ui;color:var(--color-text);width:100%">
  <canvas id=cv style="width:100%;height:300px;display:block"></canvas>
  <div style="display:flex;flex-wrap:wrap;gap:16px;margin-top:8px;font-size:13px">
    <label>Trial period <input id=p type=range min=300 max=1000 value=500> <span id=pv></span> days</label>
    <button id=snap>Snap to true period</button>
  </div>
  <div id=cap style="margin-top:6px;color:var(--color-text-secondary);font-size:12px"></div>
</div>
<script>
// Canvas, its 2D context, and the controls of the phase-folding demo.
const cv = document.getElementById('cv');
const ctx = cv.getContext('2d');
const P = document.getElementById('p');
const pv = document.getElementById('pv');
const cap = document.getElementById('cap');
const snap = document.getElementById('snap');

// Theme colours are read from CSS custom properties on <body>.
const cs = getComputedStyle(document.body);

/**
 * Read a CSS custom property, using `fallback` when it is unset or blank.
 * The value is trimmed first: a padded or whitespace-only value is truthy
 * and would otherwise defeat the `||` fallback.
 */
const themeColor = (name, fallback) => cs.getPropertyValue(name).trim() || fallback;

const acc = themeColor('--color-accent', '#5af');
const ok = themeColor('--color-success', '#3c8');
const bd = themeColor('--color-border', '#555');
const sec = themeColor('--color-text-secondary', '#999');

// Parameters of the synthetic light curve.
const Ptrue = 6.27; // true transit period
const T = 27;       // total time span covered by the samples
const N = 900;      // number of samples
const t0 = 1.5;     // time of the first mid-transit
const depth = 0.03; // fractional brightness drop at mid-transit
const dur = 0.18;   // half-width of the dip (compared against distance from mid-transit)
/**
 * Cheap bell-shaped noise sample: the sum of four uniform draws,
 * re-centred on zero and halved, so the result lies in [-1, 1).
 *
 * @returns {number} A zero-mean pseudo-random value.
 */
function g() {
  let total = 0;
  for (let draws = 0; draws < 4; draws++) {
    total += Math.random();
  }
  return (total - 2) / 2;
}
// Synthetic light curve: N evenly spaced samples over [0, T) with a
// flat-bottomed dip every Ptrue, plus noise of amplitude 0.006 per point.
const ts = [];
const fs = [];
for (let i = 0; i < N; i++) {
  const t = T * i / N;
  // Distance to the nearest mid-transit, measured in phase around t0.
  const phase = ((t - t0) % Ptrue + Ptrue) % Ptrue;
  const offset = Math.min(phase, Ptrue - phase);
  let f = 1;
  if (offset < dur) {
    f -= depth * (1 - Math.pow(offset / dur, 8));
  }
  f += g() * 0.006;
  ts.push(t);
  fs.push(f);
}

// Restore the last trial period (slider units are hundredths of a day).
P.value = store.get('p', 500);
/**
 * Repaint both panels of the phase-folding demo: the raw light curve on
 * top and the same points folded at the trial period below. Updates the
 * period readout and caption, and writes the slider value to `store`.
 */
function draw() {
  const scale = window.devicePixelRatio || 1;
  const W = (cv.width = Math.max(1, cv.clientWidth) * scale);
  const H = (cv.height = 300 * scale);
  const pad = 28 * scale;
  const mid = H * 0.46; // y coordinate separating the two panels
  ctx.clearRect(0, 0, W, H);

  // Slider value is in hundredths of a day.
  const Pt = P.value / 100;
  pv.textContent = Pt.toFixed(2);

  // Shared vertical range with a small margin around the data.
  const fmin = Math.min(...fs) - 0.003;
  const fmax = Math.max(...fs) + 0.003;

  // Top panel: raw time series.
  ctx.fillStyle = sec;
  ctx.font = `${11 * scale}px system-ui`;
  ctx.fillText('Raw light curve, dips repeat at the true period', pad, 13 * scale);
  ctx.fillStyle = acc;
  for (let i = 0; i < N; i++) {
    const px = pad + (W - 2 * pad) * ts[i] / T;
    const py = 18 * scale + (mid - 26 * scale) * (1 - (fs[i] - fmin) / (fmax - fmin));
    ctx.fillRect(px, py, 1.4 * scale, 1.4 * scale);
  }

  // The trial counts as matching when it is within 0.05 of the true period.
  const near = Math.abs(Pt - Ptrue) < 0.05;

  // Bottom panel: points folded at the trial period, phase centred on t0.
  ctx.fillStyle = sec;
  ctx.fillText('Folded at trial period, align the dips into one transit', pad, mid + 16 * scale);
  ctx.fillStyle = near ? ok : acc;
  for (let i = 0; i < N; i++) {
    let phase = (((ts[i] - t0) % Pt) + Pt) % Pt;
    phase = phase / Pt;
    if (phase > 0.5) {
      phase -= 1; // map to (-0.5, 0.5] so the transit sits mid-panel
    }
    const px = pad + (W - 2 * pad) * (phase + 0.5);
    const py = mid + 26 * scale + (H - pad - mid - 26 * scale) * (1 - (fs[i] - fmin) / (fmax - fmin));
    ctx.fillRect(px, py, 1.7 * scale, 1.7 * scale);
  }

  if (near) {
    cap.textContent = 'Aligned. At the correct period every transit stacks together, so a faint dip becomes obvious. This is how a weak signal is pulled out of the noise.';
  } else {
    cap.textContent = 'Smeared. At the wrong period the dips land at random phases and wash out. The search tries many periods until the dip snaps together.';
  }
  store.set('p', P.value);
}
/** Move the slider to the true period (in hundredths of a day) and repaint. */
function snapToTruePeriod() {
  P.value = Math.round(Ptrue * 100);
  draw();
}

// Repaint on slider movement and on canvas resize; the button jumps to the answer.
P.addEventListener('input', draw);
snap.addEventListener('click', snapToTruePeriod);
const resizeWatcher = new ResizeObserver(draw);
resizeWatcher.observe(cv);
draw();
</script>

The BLS periodogram, the actual search

Box Least Squares does the folding above for thousands of trial periods automatically and scores each one. A real periodic dip produces a sharp peak. Pure noise produces no clean peak. So for a sector of 20,000 to 30,000 stars you run this on every star and rank them by peak height. Only the few hundred with a real peak rise above the noise. Raise the noise and watch the peak drown, which is exactly the detection limit.

<div style="font-family:system-ui;color:var(--color-text);width:100%">
  <canvas id=cv style="width:100%;height:230px;display:block"></canvas>
  <div style="display:flex;flex-wrap:wrap;gap:16px;margin-top:8px;font-size:13px">
    <label>Noise level <input id=n type=range min=0 max=100 value=35> <span id=nv></span></label>
    <button id=regen>New star</button>
  </div>
  <div id=cap style="margin-top:6px;color:var(--color-text-secondary);font-size:12px"></div>
</div>
<script>
// Canvas, its 2D context, and the controls of the BLS periodogram demo.
const cv = document.getElementById('cv');
const ctx = cv.getContext('2d');
const NI = document.getElementById('n');
const nv = document.getElementById('nv');
const cap = document.getElementById('cap');
const regen = document.getElementById('regen');

// Theme colours are read from CSS custom properties on <body>.
const cs = getComputedStyle(document.body);

/**
 * Read a CSS custom property, using `fallback` when it is unset or blank.
 * The value is trimmed first: a padded or whitespace-only value is truthy
 * and would otherwise defeat the `||` fallback.
 */
const themeColor = (name, fallback) => cs.getPropertyValue(name).trim() || fallback;

const acc = themeColor('--color-accent', '#5af');
const ok = themeColor('--color-success', '#3c8');
const txt = themeColor('--color-text', '#ddd');
const bd = themeColor('--color-border', '#555');
const sec = themeColor('--color-text-secondary', '#999');

// Parameters of the synthetic light curve.
const Ptrue = 6.27;  // true transit period
const T = 27;        // total time span covered by the samples
const N = 700;       // number of samples
const t0 = 1.5;      // time of the first mid-transit
const depth = 0.025; // fractional brightness drop at mid-transit
const dur = 0.16;    // half-width of the dip (compared against distance from mid-transit)
/**
 * Cheap bell-shaped noise sample: four uniform draws are summed,
 * re-centred on zero and halved, so the result lies in [-1, 1).
 *
 * @returns {number} A zero-mean pseudo-random value.
 */
function g() {
  const first = Math.random();
  const second = Math.random();
  const third = Math.random();
  const fourth = Math.random();
  return (first + second + third + fourth - 2) / 2;
}
// Current synthetic star (sample times and fluxes) and its latest periodogram.
let ts = [];
let fs = [];
let res;

/**
 * Replace `ts` / `fs` with a fresh synthetic light curve: N evenly spaced
 * samples over [0, T) with a flat-bottomed dip every Ptrue, plus noise.
 *
 * @param {number} noise - Multiplier applied to each `g()` noise sample.
 */
function gen(noise) {
  ts = [];
  fs = [];
  for (let i = 0; i < N; i++) {
    const t = T * i / N;
    // Distance to the nearest mid-transit, measured in phase around t0.
    const phase = ((t - t0) % Ptrue + Ptrue) % Ptrue;
    const offset = Math.min(phase, Ptrue - phase);
    let f = 1;
    if (offset < dur) {
      f -= depth * (1 - Math.pow(offset / dur, 8));
    }
    f += g() * noise;
    ts.push(t);
    fs.push(f);
  }
}
/**
 * Simplified box-search periodogram over the current `ts` / `fs`.
 *
 * For each of 280 trial periods spaced evenly from 2 to 11, the samples are
 * folded (phase measured from t0) into 24 phase bins. The score for that
 * period is how far the lowest bin mean sits below the average of all
 * non-empty bin means, floored at zero.
 *
 * @returns {{periods: number[], powers: number[]}} Trial periods and their scores.
 */
function bls() {
  const Pmin = 2;
  const Pmax = 11;
  const M = 280;
  const nb = 24;
  const periods = [];
  const powers = [];

  for (let kk = 0; kk < M; kk++) {
    const trial = Pmin + (Pmax - Pmin) * kk / (M - 1);
    periods.push(trial);

    // Accumulate flux per phase bin.
    const sums = new Array(nb).fill(0);
    const counts = new Array(nb).fill(0);
    for (let i = 0; i < N; i++) {
      const phase = (((ts[i] - t0) % trial) + trial) % trial;
      const slot = Math.floor(phase / trial * nb) % nb;
      sums[slot] += fs[i];
      counts[slot]++;
    }

    // Convert sums to bin means, tracking their average and their minimum.
    let total = 0;
    let filled = 0;
    let lowest = Infinity;
    for (let b = 0; b < nb; b++) {
      if (!counts[b]) {
        continue;
      }
      sums[b] /= counts[b];
      total += sums[b];
      filled++;
      if (sums[b] < lowest) {
        lowest = sums[b];
      }
    }
    const mean = total / filled;
    powers.push(Math.max(0, mean - lowest));
  }

  return { periods, powers };
}
/**
 * Repaint the periodogram held in `res`: axes, the power curve scaled to
 * its own maximum, a dashed marker at the strongest trial period, and the
 * labels. The caption reports whether that peak lies within 0.2 of Ptrue.
 */
function draw() {
  const scale = window.devicePixelRatio || 1;
  const W = (cv.width = Math.max(1, cv.clientWidth) * scale);
  const H = (cv.height = 230 * scale);
  const pad = 34 * scale;
  ctx.clearRect(0, 0, W, H);

  // Strongest trial period.
  const pmax = Math.max(...res.powers);
  const pidx = res.powers.indexOf(pmax);
  const pPeak = res.periods[pidx];

  // Axes.
  ctx.strokeStyle = bd;
  ctx.globalAlpha = 0.6;
  ctx.beginPath();
  ctx.moveTo(pad, 8 * scale);
  ctx.lineTo(pad, H - pad);
  ctx.lineTo(W - pad, H - pad);
  ctx.stroke();
  ctx.globalAlpha = 1;

  // Power curve; `pmax || 1` avoids dividing by zero when every score is 0.
  ctx.strokeStyle = acc;
  ctx.lineWidth = 1.6 * scale;
  ctx.beginPath();
  for (let i = 0; i < res.powers.length; i++) {
    const p = res.powers[i];
    const px = pad + (W - 2 * pad) * i / (res.powers.length - 1);
    const py = 8 * scale + (H - pad - 8 * scale) * (1 - p / (pmax || 1));
    if (i === 0) {
      ctx.moveTo(px, py);
    } else {
      ctx.lineTo(px, py);
    }
  }
  ctx.stroke();

  // Dashed vertical marker at the peak.
  const peakX = pad + (W - 2 * pad) * pidx / (res.powers.length - 1);
  ctx.strokeStyle = ok;
  ctx.setLineDash([4 * scale, 3 * scale]);
  ctx.beginPath();
  ctx.moveTo(peakX, 8 * scale);
  ctx.lineTo(peakX, H - pad);
  ctx.stroke();
  ctx.setLineDash([]);

  // Peak label, clamped so it stays inside the right edge.
  ctx.fillStyle = txt;
  ctx.font = `${12 * scale}px system-ui`;
  ctx.fillText(
    'peak at P = ' + pPeak.toFixed(2) + ' days',
    Math.min(peakX + 6 * scale, W - pad - 130 * scale),
    20 * scale,
  );

  // Axis labels.
  ctx.fillStyle = sec;
  ctx.font = `${11 * scale}px system-ui`;
  ctx.fillText('BLS power', 6 * scale, 16 * scale);
  ctx.fillText('trial period (days)', W - pad - 96 * scale, H - pad + 18 * scale);

  const det = Math.abs(pPeak - Ptrue) < 0.2;
  if (det) {
    cap.textContent = 'A clear peak near 6.27 days means this star has a periodic dip. Rank every star in the sector by this peak height and only the real ones stand out.';
  } else {
    cap.textContent = 'No clean peak stands out, so at this noise level the signal is buried. This star ranks low and is set aside. Lower the noise or draw a new star.';
  }
}
/**
 * Generate a new synthetic star at the slider's noise level, recompute its
 * periodogram into `res`, repaint, and write the slider value to `store`.
 */
function rebuild() {
  // Slider units: value / 8000 is the noise multiplier passed to gen().
  const noise = NI.value / 8000;
  nv.textContent = `${(noise * 100).toFixed(2)}%`;

  gen(noise);
  res = bls();
  draw();

  store.set('n', NI.value);
}
// Moving the slider and pressing "New star" both regenerate the data.
for (const [element, eventName] of [[NI, 'input'], [regen, 'click']]) {
  element.addEventListener(eventName, rebuild);
}

// Restore the last noise setting, then build and paint the first star.
// NOTE(review): the key 'n' is also used by the signal-to-noise widget; if
// `store` is shared between widgets the two values overwrite each other - confirm.
NI.value = store.get('n', 35);
rebuild();

// A resize only repaints the existing periodogram; it does not make new data.
const resizeWatcher = new ResizeObserver(draw);
resizeWatcher.observe(cv);
</script>

5. Layer 2, telling planets from imposters

This is your Mallorn instinct exactly. You were right that every phenomenon leaves a different pattern. Here the pattern lives in the dip shape and its repeat behaviour, and we turn each pattern into a feature.

| Phenomenon | Tell-tale pattern, which becomes a feature |
| --- | --- |
| Planet transit | shallow, flat-bottomed U, same depth each time, no secondary dip |
| Eclipsing binary | often V-shaped, a secondary eclipse halfway, odd and even depths differ |
| Blend or contamination | depth diluted by a neighbour star, centroid shifts |
| Artefact or stellar noise | not cleanly periodic, or lines up with the spacecraft cadence |

The key EDA picture, do the classes separate

Take two engineered features and plot every detected dip. If planets land in one region and eclipsing binaries in another, a gradient boosted model can learn the boundary. If everything overlapped, no model could work. This single plot is the test of whether the whole approach is viable, and it is the figure that makes the proposal credible.

<div style="font-family:system-ui;color:var(--color-text)">
  <canvas id=c></canvas>
  <div id=cap style="margin-top:8px;color:var(--color-text-secondary);font-size:12px"></div>
</div>
<script>
// Theme colours are read from CSS custom properties on <body>.
const cs = getComputedStyle(document.body);

/**
 * Read a CSS custom property, using `fallback` when it is unset or blank.
 * The value is trimmed first: a padded or whitespace-only value is truthy
 * and would otherwise defeat the `||` fallback.
 */
const themeColor = (name, fallback) => cs.getPropertyValue(name).trim() || fallback;

const ok = themeColor('--color-success', '#3c8');
const er = themeColor('--color-error', '#e55');
const sec = themeColor('--color-text-secondary', '#999');
const txt = themeColor('--color-text', '#ddd');
const grid = themeColor('--color-border', '#444');
/**
 * Generate a random cluster of scatter points around (mx, my).
 *
 * Each coordinate is the centre plus a zero-mean sum of three uniform draws
 * scaled by the spread. x is clamped to [0, 1], y is clamped to be
 * non-negative, and both are rounded to three decimals.
 *
 * @param {number} n - Number of points.
 * @param {number} mx - Cluster centre on x.
 * @param {number} sx - Spread on x.
 * @param {number} my - Cluster centre on y.
 * @param {number} sy - Spread on y.
 * @returns {{x: number, y: number}[]} The generated points.
 */
function cl(n, mx, sx, my, sy) {
  // Zero-mean sum of three uniform draws, in (-1.5, 1.5).
  const jitter = () => Math.random() + Math.random() + Math.random() - 1.5;
  const round3 = (value) => +value.toFixed(3);

  const points = [];
  let made = 0;
  while (made < n) {
    const rawX = mx + jitter() * sx;
    const rawY = my + jitter() * sy;
    const x = Math.max(0, Math.min(1, rawX));
    const y = Math.max(0, rawY);
    points.push({ x: round3(x), y: round3(y) });
    made++;
  }
  return points;
}
// Scatter plot of three synthetic populations in a two-feature space.
// `Chart` is not defined in this script; it is assumed to be a charting
// library loaded by the host page.
const scatterCanvas = document.getElementById('c');

// Builds one axis definition: themed title, grid and ticks, plus bounds.
const themedAxis = (text, bounds) => ({
  title: { display: true, text, color: txt },
  ...bounds,
  grid: { color: grid },
  ticks: { color: sec },
});

const scatterDatasets = [
  {
    label: 'Planet transit (U-shape, no secondary)',
    data: cl(70, 0.86, 0.10, 0.02, 0.03),
    backgroundColor: ok,
    pointRadius: 3,
  },
  {
    label: 'Eclipsing binary (V-shape, has secondary)',
    data: cl(70, 0.18, 0.12, 0.55, 0.22),
    backgroundColor: er,
    pointRadius: 3,
  },
  {
    label: 'Noise or artefact (low SNR)',
    data: cl(60, 0.5, 0.34, 0.35, 0.30),
    backgroundColor: sec,
    pointRadius: 2,
  },
];

new Chart(scatterCanvas, {
  type: 'scatter',
  data: { datasets: scatterDatasets },
  options: {
    scales: {
      x: themedAxis('shape statistic   (0 is V-shape  ...  1 is flat-bottom U)', { min: 0, max: 1 }),
      y: themedAxis('secondary-eclipse depth (relative)', { min: 0 }),
    },
    plugins: { legend: { labels: { color: txt } } },
  },
});

// Static caption explaining how to read the plot.
document.getElementById('cap').textContent = 'Each dot is one detected dip described by two features. Planets cluster lower right, flat bottom and no secondary eclipse. Eclipsing binaries sit upper left, V-shape with a secondary. Because the classes occupy different regions, a classifier can learn the boundary.';
</script>

6. The 440 GB TIC is a red herring

What that big download is: the TESS Input Catalog is a phone book of stars, about 1.7 billion rows with position, brightness, star radius, temperature and contamination. It has no time data, so you cannot detect a single transit from it. You do not train on it and you never need the whole thing.

What you actually do with it: look up a few rows for your candidate stars, to convert depth into planet size (star radius) and to flag crowded stars (the contamination column, which is your blend signal). That is a few thousand rows through the MAST query service, not 440 GB.

This is the trap half the field falls into. They download the catalog thinking it is the dataset, choke on it, and never reach the light curves. We skip it.

7. How much data we actually need

Three different things, do not confuse them.

| Purpose | What | Size |
| --- | --- | --- |
| Train the classifier | the curated labelled set ISRO provides, plus public confirmed planets and known binaries | small, a few thousand examples, not gigabytes |
| Run detection and demo | one sector of light curves, about 20,000 to 30,000 stars | about 30 to 60 GB raw, reduced to features at once |
| Metadata lookups | TIC rows for your candidates only | a few megabytes |

The size that matters for training is the number and variety of labelled examples, not gigabytes. A feature based gradient boosted model does not need millions of curves. Mallorn worked with about ten thousand and that was plenty.

8. Is the data enough to generalise

This is the real machine learning question, and the EDA answers it with three checks.

  1. Feature separability. The scatter in section 5. If planets and binaries separate in feature space, the model can learn it.
  2. Coverage. Does the labelled training set span the same star brightness and period range as the sector you will apply it to? If you train only on bright stars and apply the model to faint, noisy ones, it will not generalise. This is where the TIC metadata earns its place, to compare the two populations.
  3. Detection floor. You cannot detect arbitrarily small planets in noisy data. The transit signal to noise grows with depth and with the number of transits observed. Below a threshold of about seven, detection fails. The slider below shows where that floor sits.
<div style="font-family:system-ui;color:var(--color-text);width:100%">
  <div style="display:flex;flex-wrap:wrap;gap:16px;font-size:13px;margin-bottom:10px">
    <label>Planet depth <input id=d type=range min=2 max=60 value=20> <span id=dv></span></label>
    <label>Noise per point <input id=n type=range min=1 max=60 value=22> <span id=nv></span></label>
    <label>Points in transit <input id=k type=range min=5 max=200 value=60> <span id=kv></span></label>
  </div>
  <div id=out style="font-size:15px"></div>
  <div id=cap style="margin-top:6px;color:var(--color-text-secondary);font-size:12px"></div>
</div>
<script>
// Sliders, their numeric readouts, and the output areas of the
// signal-to-noise demo.
const d = document.getElementById('d');
const n = document.getElementById('n');
const k = document.getElementById('k');
const dv = document.getElementById('dv');
const nv = document.getElementById('nv');
const kv = document.getElementById('kv');
const out = document.getElementById('out');
const cap = document.getElementById('cap');

// Theme colours are read from CSS custom properties on <body>.
const cs = getComputedStyle(document.body);

/**
 * Read a CSS custom property, using `fallback` when it is unset or blank.
 * The value is trimmed first: a padded or whitespace-only value is truthy
 * and would otherwise defeat the `||` fallback.
 */
const themeColor = (name, fallback) => cs.getPropertyValue(name).trim() || fallback;

const ok = themeColor('--color-success', '#3c8');
const er = themeColor('--color-error', '#e55');

// Restore the previous slider positions. `store` is not defined in this
// script; it is assumed to be supplied by the host page.
d.value = store.get('d', 20);
n.value = store.get('n', 22);
k.value = store.get('k', 60);
/**
 * Recompute the transit signal-to-noise from the three sliders and show
 * the verdict. SNR = depth / (noise / sqrt(points in transit)); a value of
 * 7 or more counts as detectable. Also refreshes the slider readouts and
 * caption, and writes the slider values to `store`.
 */
function calc() {
  const depth = Number(d.value);
  const noise = Number(n.value);
  const K = Number(k.value);

  // Averaging K points shrinks the per-point noise by sqrt(K).
  const snr = depth / (noise / Math.sqrt(K));

  // Depth and noise sliders are in tenths of the displayed unit.
  dv.textContent = (depth / 10).toFixed(1);
  nv.textContent = (noise / 10).toFixed(1);
  kv.textContent = K;

  const det = snr >= 7;
  const verdict = det
    ? `<span style="color:${ok}">detectable</span>`
    : `<span style="color:${er}">below the floor</span>`;
  out.innerHTML = `Transit signal to noise = <b>${snr.toFixed(1)}</b>   ${verdict}`;

  if (det) {
    cap.textContent = 'Above the usual threshold near seven. A small planet still becomes detectable if it transits enough times, even when single points are noisy. This is why more observations help.';
  } else {
    cap.textContent = 'Below threshold. The planet is too small, the star too noisy, or too few transits were seen. This boundary defines what generalised detection can and cannot reach.';
  }

  store.set('d', d.value);
  store.set('n', n.value);
  store.set('k', k.value);
}
// Recompute whenever any slider moves, and once immediately for the initial state.
for (const slider of [d, n, k]) {
  slider.addEventListener('input', calc);
}
calc();
</script>

If those three hold, the model generalises within that stated floor. That is the honest, defensible answer, and none of it needs the 440 GB or even a full sector to demonstrate. It needs the labelled set plus a sample.

9. What this means for the proposal

Back to ps7-exoplanet-light-curves and the decision-ps3-vs-ps7.