Back to home page

Quest Cross Reference

 
 

    


Warning, cross-references for /kernel/sched/vcpu.c need to be fixed.

0001 /*                    The Quest Operating System
0002  *  Copyright (C) 2005-2010  Richard West, Boston University
0003  *
0004  *  This program is free software: you can redistribute it and/or modify
0005  *  it under the terms of the GNU General Public License as published by
0006  *  the Free Software Foundation, either version 3 of the License, or
0007  *  (at your option) any later version.
0008  *
0009  *  This program is distributed in the hope that it will be useful,
0010  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
0011  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0012  *  GNU General Public License for more details.
0013  *
0014  *  You should have received a copy of the GNU General Public License
0015  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
0016  */
0017 
0018 /* ************************************************** */
0019 
0020 #include "sched/vcpu.h"
0021 #include "sched/sched.h"
0022 #include "arch/i386-percpu.h"
0023 #include "arch/i386-div64.h"
0024 #include "util/perfmon.h"
0025 #include "util/cpuid.h"
0026 #include "smp/smp.h"
0027 #include "smp/apic.h"
0028 #include "smp/spinlock.h"
0029 #include "util/debug.h"
0030 #include "util/printf.h"
0031 #include "mem/pow2.h"
0032 
0033 #define UNITS_PER_SEC 1000
0034 
0035 //#define DEBUG_VCPU
0036 //#define DUMP_STATS_VERBOSE
0037 //#define DUMP_STATS_VERBOSE_2
0038 //#define CHECK_INVARIANTS
0039 
0040 /* Use sporadic servers for I/O VCPUs */
0041 //#define SPORADIC_IO
0042 
0043 #ifdef DEBUG_VCPU
0044 #define DLOG(fmt,...) DLOG_PREFIX("vcpu",fmt,##__VA_ARGS__)
0045 #else
0046 #define DLOG(fmt,...) ;
0047 #endif
0048 
/* TSC ticks per millisecond and per scheduling unit (1/UNITS_PER_SEC s);
 * both computed in vcpu_init from tsc_freq. */
u32 tsc_freq_msec, tsc_unit_freq;
/* TSC timestamp taken in vcpu_init (re-read in vcpu_dump_stats to
 * delimit each statistics window). */
u64 vcpu_init_time;

/* Static configuration of one VCPU: type, budget C and period T (in
 * scheduling units; scaled to TSC ticks by vcpu_init), and for IO
 * VCPUs the device class it serves. */
struct vcpu_params { vcpu_type type; u32 C, T; iovcpu_class class; };
static struct vcpu_params init_params[] = {
  { MAIN_VCPU, 20, 100 },
  { MAIN_VCPU, 20, 100 },
  { MAIN_VCPU, 20, 100 },
  { MAIN_VCPU, 20, 100 },

#ifdef SPORADIC_IO
  { IO_VCPU, 10, 300, IOVCPU_CLASS_USB },
  { IO_VCPU, 10, 400, IOVCPU_CLASS_ATA },
  { IO_VCPU, 10, 500, IOVCPU_CLASS_NET },
#else
  { IO_VCPU, 1, 10, IOVCPU_CLASS_USB },
  { IO_VCPU, 1, 10, IOVCPU_CLASS_ATA },
  { IO_VCPU, 1, 10, IOVCPU_CLASS_NET },
#endif
};
#define NUM_VCPUS (sizeof (init_params) / sizeof (struct vcpu_params))
/* Runtime VCPU instances, one per init_params entry. */
static vcpu vcpus[NUM_VCPUS] ALIGNED (VCPU_ALIGNMENT);
0071 
0072 extern uint
0073 lowest_priority_vcpu (void)
0074 {
0075   uint i, n=0, T=0;
0076   for (i=0; i<NUM_VCPUS; i++) {
0077     if (init_params[i].type == MAIN_VCPU && init_params[i].T >= T) {
0078       T = init_params[i].T;
0079       n = i;
0080     }
0081   }
0082   return n;
0083 }
0084 
0085 vcpu *
0086 vcpu_lookup (int i)
0087 {
0088   if (0 <= i && i < NUM_VCPUS)
0089     return &vcpus[i];
0090   return NULL;
0091 }
0092 
0093 int
0094 vcpu_index (vcpu *v)
0095 {
0096   return (((uint) v) - ((uint) &vcpus)) / sizeof (vcpu);
0097 }
0098 
/* Acquire the spinlock guarding this vcpu's scheduling state. */
void
vcpu_lock (vcpu *vcpu)
{
  spinlock_lock (&vcpu->lock);
}
0104 
/* Release the spinlock guarding this vcpu's scheduling state. */
void
vcpu_unlock (vcpu *vcpu)
{
  spinlock_unlock (&vcpu->lock);
}
0110 
0111 /* locked functions */
0112 
0113 bool
0114 vcpu_in_runqueue (vcpu *vcpu, task_id task)
0115 {
0116   task_id i = vcpu->runqueue;
0117 
0118   while (i != 0) {
0119     if (task == i) return TRUE;
0120     i = lookup_TSS (i)->next;
0121   }
0122   return task == i;
0123 }
0124 
0125 void
0126 vcpu_remove_from_runqueue (vcpu *vcpu, task_id task)
0127 {
0128   task_id *q = &vcpu->runqueue;
0129   while (*q != 0) {
0130     if (*q == task) {
0131       *q = lookup_TSS (*q)->next;
0132       return;
0133     }
0134     q = &lookup_TSS (*q)->next;
0135   }
0136 }
0137 
0138 void
0139 vcpu_internal_schedule (vcpu *vcpu)
0140 {
0141   u64 now = vcpu->virtual_tsc;
0142 
0143   if (vcpu->next_schedule == 0 || vcpu->next_schedule <= now)
0144     goto sched;
0145 
0146   if (vcpu_in_runqueue (vcpu, vcpu->tr) == TRUE) {
0147     /* keep vcpu->tr running, remove from runqueue */
0148     vcpu_remove_from_runqueue (vcpu, vcpu->tr);
0149   } else
0150     goto sched;
0151 
0152   return;
0153 
0154  sched:
0155   /* round-robin */
0156   vcpu->tr = queue_remove_head (&vcpu->runqueue);
0157   vcpu->next_schedule = now + vcpu->quantum;
0158 }
0159 
/* Append task to the tail of vcpu's 2nd-level runqueue. */
void
vcpu_runqueue_append (vcpu *vcpu, task_id task)
{
  queue_append (&vcpu->runqueue, task);
}
0165 
/* Save the per-CPU segment selector (the register named by
 * PER_CPU_SEG_STR) into the FS slot of 'next's TSS so it is restored
 * across the context switch.
 * NOTE(review): assumes usFS is the FS selector field of the tss --
 * confirm against the tss definition. */
#define preserve_segment(next)                                  \
  {                                                             \
    tss *tssp = (tss *)lookup_TSS (next);                       \
    u16 sel;                                                    \
    asm volatile ("movw %%"PER_CPU_SEG_STR", %0":"=r" (sel));   \
    tssp->usFS = sel;                                           \
  }

/* All context switches in this file go through the software switcher. */
#define switch_to(next) software_context_switch (next)
0175 
/* Context-switch this PCPU to the task currently selected by vcpu. */
void
vcpu_switch_to (vcpu *vcpu)
{
  switch_to (vcpu->tr);
}
0181 
/* A VCPU with no selected task (tr == 0) is considered idle. */
bool
vcpu_is_idle (vcpu *vcpu)
{
  return vcpu->tr == 0;
}
0187 
0188 void
0189 vcpu_queue_append (vcpu **queue, vcpu *vcpu)
0190 {
0191   while (*queue) {
0192     if (*queue == vcpu)
0193       /* already on queue */
0194       return;
0195     queue = &((*queue)->next);
0196   }
0197   vcpu->next = NULL;
0198   *queue = vcpu;
0199 }
0200 
0201 vcpu *
0202 vcpu_queue_remove_head (vcpu **queue)
0203 {
0204   vcpu *vcpu = NULL;
0205   if (*queue) {
0206     vcpu = *queue;
0207     *queue = (*queue)->next;
0208   }
0209   return vcpu;
0210 }
0211 
0212 /* ************************************************** */
0213 
/* Per-PCPU head of the 1st-level queue of runnable VCPUs. */
DEF_PER_CPU (vcpu *, vcpu_queue);
INIT_PER_CPU (vcpu_queue) {
  percpu_write (vcpu_queue, NULL);
}

/* VCPU currently scheduled on this PCPU (NULL when the PCPU idles). */
DEF_PER_CPU (vcpu *, vcpu_current);
INIT_PER_CPU (vcpu_current) {
  percpu_write (vcpu_current, NULL);
}

/* Idle-task selector for this PCPU; 0 until lazily initialized by the
 * schedulers below. */
DEF_PER_CPU (task_id, vcpu_idle_task);
INIT_PER_CPU (vcpu_idle_task) {
  percpu_write (vcpu_idle_task, 0);
}
0228 
/* task accounting */

/* Virtual TSC of the VCPU currently scheduled on this PCPU.
 * NOTE(review): dereferences vcpu_current without a NULL check; if
 * called while the PCPU is idle (vcpu_current == NULL) this faults --
 * confirm all callers run in VCPU context. */
u64
vcpu_current_vtsc (void)
{
  vcpu *v = percpu_read (vcpu_current);
  return v->virtual_tsc;
}
0236 
0237 static void
0238 vcpu_acnt_end_timeslice (vcpu *vcpu)
0239 {
0240   u64 now;
0241   int i;
0242 
0243   RDTSC (now);
0244 
0245   if (vcpu->prev_tsc) {
0246     vcpu->timestamps_counted += now - vcpu->prev_tsc;
0247     vcpu->virtual_tsc += now - vcpu->prev_tsc;
0248   }
0249 
0250   for (i=0; i<2; i++) {
0251     u64 value = perfmon_pmc_read (i);
0252     if (vcpu->prev_pmc[i])
0253       vcpu->pmc_total[i] += value - vcpu->prev_pmc[i];
0254   }
0255 }
0256 
0257 static void
0258 vcpu_acnt_begin_timeslice (vcpu *vcpu)
0259 {
0260   u64 now;
0261   int i;
0262 
0263   RDTSC (now);
0264   vcpu->prev_tsc = now;
0265 
0266   for (i=0; i<2; i++) {
0267     u64 value = perfmon_pmc_read (i);
0268     vcpu->prev_pmc[i] = value;
0269   }
0270 }
0271 
/* Round-robin 1st-level scheduler (simpler alternative to
 * vcpu_schedule): rotates through this PCPU's VCPU queue with no
 * budget or timer management, lets the chosen VCPU pick a task at the
 * 2nd level, then switches to that task (or the idle task). */
extern void
vcpu_rr_schedule (void)
{
  task_id next = 0;
  vcpu
    *queue = percpu_read (vcpu_queue),
    *cur   = percpu_read (vcpu_current),
    *vcpu  = NULL;

  if (cur)
    /* handle end-of-timeslice accounting */
    vcpu_acnt_end_timeslice (cur);
  if (queue) {
    /* get next vcpu from queue */
    vcpu = vcpu_queue_remove_head (&queue);
    /* perform 2nd-level scheduling */
    vcpu_internal_schedule (vcpu);
    next = vcpu->tr;
    /* if vcpu still has a runqueue, put it back on 1st-level queue */
    if (vcpu->runqueue)
      vcpu_queue_append (&queue, vcpu);
    percpu_write (vcpu_queue, queue);
    percpu_write (vcpu_current, vcpu);
    DLOG ("vcpu_schedule: pcpu=%d vcpu=%p vcpu->tr=0x%x ->runqueue=0x%x next=0x%x",
          get_pcpu_id (), vcpu, vcpu->tr, vcpu->runqueue, next);
  }
  if (vcpu)
    /* handle beginning-of-timeslice accounting */
    vcpu_acnt_begin_timeslice (vcpu);
  if (next == 0) {
    /* no task selected, go idle */
    next = percpu_read (vcpu_idle_task);
    percpu_write (vcpu_current, NULL);
  }
  if (next == 0) {
    /* workaround: vcpu_idle_task was not initialized yet */
    next = idleTSS_selector[get_pcpu_id ()];
    percpu_write (vcpu_idle_task, next);
  }

  /* switch to new task or continue running same task */
  if (str () == next)
    return;
  else
    switch_to (next);
}
0318 
/* Round-robin wakeup: bind an unbound task (cpu == 0xFF) to the next
 * MAIN VCPU in rotation, then enqueue the task at the 2nd level and
 * its VCPU at the 1st level, running the unblock hook if the VCPU was
 * neither runnable nor running. */
extern void
vcpu_rr_wakeup (task_id task)
{
  DLOG ("vcpu_wakeup (0x%x), cpu=%d", task, get_pcpu_id ());
  quest_tss *tssp = lookup_TSS (task);
  static int next_vcpu_binding = 1;

  /* 0xFF marks a task with no VCPU assigned yet */
  if (tssp->cpu == 0xFF) {
    do {
      tssp->cpu = next_vcpu_binding;
      next_vcpu_binding++;
      if (next_vcpu_binding >= NUM_VCPUS)
        next_vcpu_binding = 0;
    } while  (vcpu_lookup (tssp->cpu)->type != MAIN_VCPU);
    logger_printf ("vcpu: task 0x%x now bound to vcpu=%d\n", task, tssp->cpu);
  }

  vcpu *vcpu = vcpu_lookup (tssp->cpu);

  /* put task on vcpu runqueue (2nd level) */
  vcpu_runqueue_append (vcpu, task);

  /* put vcpu on pcpu queue (1st level) */
  vcpu_queue_append (percpu_pointer (vcpu->cpu, vcpu_queue), vcpu);

  if (!vcpu->runnable && !vcpu->running && vcpu->hooks->unblock)
    vcpu->hooks->unblock (vcpu);
  vcpu->runnable = TRUE;
}
0348 
0349 /* ************************************************** */
0350 
/* TSC value captured at the previous run of vcpu_schedule. */
DEF_PER_CPU (u64, pcpu_tprev);
INIT_PER_CPU (pcpu_tprev) {
  percpu_write64 (pcpu_tprev, 0LL);
}
/* Most recent overrun recorded by an end_timeslice hook. */
DEF_PER_CPU (s64, pcpu_overhead);
INIT_PER_CPU (pcpu_overhead) {
  percpu_write64 (pcpu_overhead, 0LL);
}
/* Idle TSC ticks accumulated during the current stats window. */
DEF_PER_CPU (u64, pcpu_idle_time);
INIT_PER_CPU (pcpu_idle_time) {
  percpu_write64 (pcpu_idle_time, 0LL);
}
/* TSC at the start of the current idle period (0 = not idling). */
DEF_PER_CPU (u64, pcpu_idle_prev_tsc);
INIT_PER_CPU (pcpu_idle_prev_tsc) {
  percpu_write64 (pcpu_idle_prev_tsc, 0LL);
}
/* TSC ticks spent inside vcpu_schedule in the current stats window. */
DEF_PER_CPU (u32, pcpu_sched_time);
INIT_PER_CPU (pcpu_sched_time) {
  percpu_write (pcpu_sched_time, 0);
}
0371 
0372 static void
0373 idle_time_acnt_begin ()
0374 {
0375   u64 now;
0376 
0377   RDTSC (now);
0378   percpu_write64 (pcpu_idle_prev_tsc, now);
0379 }
0380 
0381 static void
0382 idle_time_acnt_end ()
0383 {
0384   u64 idle_time = percpu_read64 (pcpu_idle_time);
0385   u64 idle_prev_tsc = percpu_read64 (pcpu_idle_prev_tsc);
0386   u64 now;
0387 
0388   RDTSC (now);
0389 
0390   if (idle_prev_tsc)
0391     idle_time += now - idle_prev_tsc;
0392   percpu_write64 (pcpu_idle_time, idle_time);
0393 }
0394 
0395 static inline u32
0396 compute_percentage (u64 overall, u64 usage)
0397 {
0398   u64 res = div64_64 (usage * 10000, overall);
0399   u16 whole, frac;
0400 
0401   whole = ((u16) res) / 100;
0402   frac = ((u16) res) - whole * 100;
0403   return (((u32) whole) << 16) | (u32) frac;
0404 }
0405 
/* Dump scheduling statistics for the window since the previous call
 * (CPU shares per VCPU, idle share, scheduler overhead, and -- under
 * the DUMP_* options -- driver throughput and cache counters) to the
 * logger, then reset the per-window counters. */
extern void
vcpu_dump_stats (void)
{
  static u32 dump_count = 0;
  int i;
#ifdef DUMP_STATS_VERBOSE_2
  vcpu *cur = percpu_read (vcpu_current);
#endif
  s64 overhead = percpu_read64 (pcpu_overhead);
  u64 idle_time = percpu_read64 (pcpu_idle_time);
  u64 sum = idle_time;
  u32 stime = percpu_read (pcpu_sched_time);
  extern u32 uhci_sample_bps (void);
  extern u32 atapi_sample_bps (void);
  u32 uhci_bps = uhci_sample_bps ();
  u32 atapi_bps = atapi_sample_bps ();
  u64 now; RDTSC (now);

  logger_printf ("vcpu_dump_stats n=%d t=0x%llX ms=0x%X\n",
                 dump_count++, now, tsc_freq_msec);

  /* 'now' becomes the length of this measurement window, and
   * vcpu_init_time marks the start of the next one */
  now -= vcpu_init_time;
  RDTSC (vcpu_init_time);

  u32 sched = compute_percentage (now, stime);
  logger_printf ("  overhead=0x%llX sched=%02d.%02d uhci_bps=%d atapi_bps=%d\n",
                 overhead,
                 sched >> 16, sched & 0xFF,
                 uhci_bps, atapi_bps);
#define DUMP_CACHE_STATS
#ifdef DUMP_CACHE_STATS
  vcpu *queue = percpu_read (vcpu_queue);
  vcpu **ptr = NULL;

  if (queue) {
    for (ptr = &queue; *ptr != NULL; ptr = &(*ptr)->next) {
      logger_printf ("vcpu=%X pcpu=%d cache occupancy=%llX mpki=%llX\n type=%d",
                     (uint32) *ptr, (*ptr)->cpu, (*ptr)->cache_occupancy,
                     (*ptr)->mpki, (*ptr)->type);
    }
  }
#endif
#ifdef DUMP_STATS_VERBOSE
  extern u64 irq_response, irq_turnaround, irq_resp_max, irq_resp_min;
  extern u64 atapi_sector_read_time, atapi_sector_cpu_time;
  extern u64 atapi_req_diff;
  extern u32 atapi_req_count, ata_irq_count;
  extern u32 e1000_packet_count;
  extern u64 e1000_packet_bytes;
  if (ata_irq_count && atapi_req_count)
    logger_printf ("  response=0x%llX responsemax=0x%llX responsemin=0x%llX\n"
                   "  readtime=0x%llX readvcpu=0x%llX"
                   " avgreqdiff=0x%llX\n",
                   div64_64 (irq_response, (u64) ata_irq_count),
                   irq_resp_max,
                   irq_resp_min,
                   div64_64 (atapi_sector_read_time, (u64) atapi_req_count),
                   div64_64 (atapi_sector_cpu_time, (u64) atapi_req_count),
                   div64_64 (atapi_req_diff, (u64) atapi_req_count));

  extern u64 atapi_count, atapi_cycles, atapi_max, atapi_min;
  if (atapi_count)
    logger_printf ("  atapiavg=0x%llX atapimax=0x%llX atapimin=0x%llX\n",
                   div64_64 (atapi_cycles, atapi_count),
                   atapi_max, atapi_min);
  atapi_count = atapi_max = atapi_cycles = 0;
  atapi_min = ~0LL;

  logger_printf ("  e1000pps=0x%llX e1000bps=0x%llX\n",
                 div64_64 ((u64) e1000_packet_count * tsc_freq, now),
                 div64_64 (e1000_packet_bytes * tsc_freq, now));
  e1000_packet_bytes = 0;
  e1000_packet_count = 0;

  /* 5-sec window */
  ata_irq_count = 0;
  irq_resp_max = irq_turnaround = irq_response = 0;
  irq_resp_min = ~0LL;
  atapi_req_count = 0;
  atapi_req_diff = 0;
  atapi_sector_read_time = atapi_sector_cpu_time = 0;

#ifdef DUMP_STATS_VERBOSE_2
  logger_printf ("idle tsc=0x%llX%s\n", idle_time, (cur==NULL ? " (*)" : ""));
#endif
#endif

  /* reset per-window counters */
  percpu_write64 (pcpu_idle_time, 0LL);
  percpu_write (pcpu_sched_time, 0);

  for (i=0; i<NUM_VCPUS; i++) {
    vcpu *vcpu = &vcpus[i];
#if defined(DUMP_STATS_VERBOSE) && defined (DUMP_STATS_VERBOSE_2)
    if (vcpu->type == IO_VCPU) {
      logger_printf ("vcpu=%d pcpu=%d tsc=0x%llX pmc[0]=0x%llX pmc[1]=0x%llX%s\n",
                     i, vcpu->cpu,
                     vcpu->timestamps_counted,
                     vcpu->pmc_total[0],
                     vcpu->pmc_total[1],
                     (vcpu == cur ? " (*)" : ""));
      logger_printf ("  b=0x%llX overhead=0x%llX delta=0x%llX usage=0x%X\n",
                     vcpu->b, vcpu->sched_overhead, vcpu->prev_delta,
                     vcpu->prev_usage);
    }
#endif
    sum += vcpu->timestamps_counted;
  }

  u32 res = compute_percentage (now, idle_time);
  logger_printf (" idle=%02d.%02d\n", res >> 16, res & 0xFF);
  for (i=0; i<NUM_VCPUS; i++) {
    vcpu *vcpu = &vcpus[i];
    res = compute_percentage (now, vcpu->timestamps_counted);
    vcpu->timestamps_counted = 0;
#ifndef SPORADIC_IO
    logger_printf (" V%02d=%02d.%02d %d", i, res >> 16, res & 0xFF,
                   vcpu->type != MAIN_VCPU ? 0 : vcpu->main.Q.size);
#else
    logger_printf (" V%02d=%02d.%02d %d", i, res >> 16, res & 0xFF,
                   vcpu->main.Q.size);
#endif
    if ((i % 4) == 3) {
      logger_printf ("\n");
    }
  }
  logger_printf ("\nend vcpu_dump_stats\n");
}
0533 
0534 /* ************************************************** */
0535 
0536 void
0537 repl_queue_pop (repl_queue *Q)
0538 {
0539   if (Q->head) {
0540     replenishment *r = Q->head->next;
0541     Q->head->t = Q->head->b = 0;
0542     Q->head->next = NULL;
0543     Q->head = r;
0544     Q->size--;
0545   }
0546 }
0547 
0548 void
0549 repl_queue_add (repl_queue *Q, u64 b, u64 t)
0550 {
0551   if (Q->size < MAX_REPL) {
0552     replenishment *r = NULL, **rq;
0553     int i;
0554     for (i=0; i<MAX_REPL; i++) {
0555       if (Q->array[i].t == 0) {
0556         r = &Q->array[i];
0557         break;
0558       }
0559     }
0560     if (!r) panic ("Q->size < MAX_REPL but no free entry");
0561     rq = &Q->head;
0562     /* find insertion point */
0563     while (*rq && (*rq)->t < t)
0564       rq = &(*rq)->next;
0565     /* insert */
0566     r->next = *rq;
0567     *rq = r;
0568     r->t = t;
0569     r->b = b;
0570     Q->size++;
0571   }
0572 }
0573 
0574 /* ************************************************** */
0575 
#ifdef CHECK_INVARIANTS
/* Debug-only sanity checks: the current VCPU must be marked running,
 * runnable VCPUs must be on the per-PCPU queue (and only those), and
 * a MAIN VCPU's queued replenishments must sum exactly to its budget
 * C.  Panics on any violation. */
static void
check_run_invariants (void)
{
  vcpu
    *queue = percpu_read (vcpu_queue),
    *cur   = percpu_read (vcpu_current),
    *v, *q;
  int i;
  if (cur && !cur->running) panic ("current is not running");
  for (i=0; i<NUM_VCPUS; i++) {
    v = &vcpus[i];
    if (v->running && v != cur)
      panic ("vcpu running is not current");
    if (v->runnable) {
      for (q = queue; q; q = q->next) {
        if (q == v) goto ok;
      }
      panic ("vcpu runnable is not on queue");
    ok:;
    } else {
      for (q = queue; q; q = q->next) {
        if (q == v) panic ("vcpu not runnable is on queue");
      }
    }
    if (v->type == MAIN_VCPU) {
      /* warn when the replenishment queue is close to saturation */
      if (v->main.Q.size >= MAX_REPL-1)
        logger_printf ("vcpu %d has %d repls\n", i, v->main.Q.size);
      u64 sum = 0;
      replenishment *r;
      for (r = v->main.Q.head; r != NULL; r = r->next)
        sum += r->b;
      if (sum != v->C) {
        com1_printf ("v->C=0x%llX sum=0x%llX\n", v->C, sum);
        panic ("vcpu replenishments out of whack");
      }
    }
  }
}
#endif
0616 
/* Main 1st-level scheduler.  Selects the highest-priority (smallest
 * period T) VCPU that has available budget b, runs its 2nd-level
 * scheduler, programs the LAPIC timer for the next budget expiry or
 * replenishment event, and switches to the chosen task (or idles).
 * NOTE(review): Tprev is assigned from cur->T but never read below. */
extern void
vcpu_schedule (void)
{
  task_id next = 0;
  vcpu
    *queue = percpu_read (vcpu_queue),
    *cur   = percpu_read (vcpu_current),
    *vcpu  = NULL,
    **ptr,
    **vnext = NULL;
  u64 tprev = percpu_read64 (pcpu_tprev);
  u64 tcur, tdelta, Tprev = 0, Tnext = 0;
  bool timer_set = FALSE;

#ifdef CHECK_INVARIANTS
  check_run_invariants ();
#endif

  RDTSC (tcur);

  /* time consumed since the previous scheduling decision */
  tdelta = tcur - tprev;

  DLOG ("tcur=0x%llX tprev=0x%llX tdelta=0x%llX", tcur, tprev, tdelta);

  if (cur) {
    Tprev = cur->T;

    /* handle end-of-timeslice accounting */
    vcpu_acnt_end_timeslice (cur);

    /* invoke VCPU-specific end of timeslice budgeting */
    if (cur->hooks->end_timeslice)
      cur->hooks->end_timeslice (cur, tdelta);
  } else idle_time_acnt_end ();

  if (queue) {
    /* pick highest priority vcpu with available budget */
    for (ptr = &queue; *ptr != NULL; ptr = &(*ptr)->next) {
      if (Tnext == 0 || (*ptr)->T < Tnext) {
        /* update replenishments to refresh budget */
        if ((*ptr)->hooks->update_replenishments)
          (*ptr)->hooks->update_replenishments (*ptr, tcur);
        if ((*ptr)->b > 0) {
          Tnext = (*ptr)->T;
          vnext = ptr;
        }
      }
    }

    if (vnext) {
      vcpu = *vnext;
      /* internally schedule */
      vcpu_internal_schedule (vcpu);
      /* keep vcpu on queue if it has other runnable tasks */
      if (vcpu->runqueue == 0) {
        /* otherwise, remove it */
        *vnext = (*vnext)->next;
        vcpu->runnable = FALSE;
      }
      next = vcpu->tr;

      if (cur != vcpu) {
        perfmon_vcpu_acnt_end (cur);
      }

      percpu_write (vcpu_current, vcpu);
      percpu_write (vcpu_queue, queue);
      DLOG ("scheduling vcpu=%p with budget=0x%llX", vcpu, vcpu->b);
    } else {
      if (cur) {
        perfmon_vcpu_acnt_end (cur);
      }

      percpu_write (vcpu_current, NULL);
    }

    /* find time of next important event: start from the chosen VCPU's
     * remaining budget and shrink to any earlier replenishment */
    if (vcpu)
      tdelta = vcpu->b;
    else
      tdelta = 0;
    for (ptr = &queue; *ptr != NULL; ptr = &(*ptr)->next) {
      if (Tnext == 0 || (*ptr)->T <= Tnext) {
        u64 event = 0;
        if ((*ptr)->hooks->next_event)
          event = (*ptr)->hooks->next_event (*ptr);
        /* NOTE(review): assumes event >= tcur; a past event would wrap
         * the unsigned subtraction -- confirm hooks return only future
         * times or 0. */
        if (event != 0 && (tdelta == 0 || event - tcur < tdelta))
          tdelta = event - tcur;
      }
    }

    /* set timer */
    if (tdelta > 0) {
      /* convert TSC ticks to LAPIC bus-clock ticks, clamped to
       * [1, one scheduling quantum] */
      u32 count = (u32) div64_64 (tdelta * ((u64) cpu_bus_freq), tsc_freq);
      if (count == 0)
        count = 1;
      if (count > cpu_bus_freq / QUANTUM_HZ)
        count = cpu_bus_freq / QUANTUM_HZ;
      if (vcpu) {
        vcpu->prev_delta = tdelta;
        vcpu->prev_count = count;
      }
      LAPIC_start_timer (count);
      timer_set = TRUE;
    }
  }

  if (!timer_set)
    LAPIC_start_timer (cpu_bus_freq / QUANTUM_HZ);

  if (vcpu) {
    /* handle beginning-of-timeslice accounting */
    vcpu_acnt_begin_timeslice (vcpu);
    if (cur != vcpu)
      perfmon_vcpu_acnt_start (vcpu);
  } else
    idle_time_acnt_begin ();
  if (next == 0) {
    /* no task selected, go idle */
    next = percpu_read (vcpu_idle_task);
    percpu_write (vcpu_current, NULL);
  }
  if (next == 0) {
    /* workaround: vcpu_idle_task was not initialized yet */
    next = idleTSS_selector[get_pcpu_id ()];
    percpu_write (vcpu_idle_task, next);
  }

  /* current time becomes previous time */
  percpu_write64 (pcpu_tprev, tcur);

  /* measure schedule running time */
  u64 now; RDTSC (now);
  u32 sched_time = percpu_read (pcpu_sched_time);
  percpu_write (pcpu_sched_time, sched_time + (u32) (now - tcur));

  if (cur) cur->running = FALSE;
  if (vcpu) vcpu->running = TRUE;

  /* switch to new task or continue running same task */
  if (str () == next)
    return;
  else
    switch_to (next);
}
0762 
/* Wake a task: bind it to a MAIN VCPU if unbound (cpu == 0xFF),
 * enqueue it at both scheduling levels, run the VCPU's unblock hook,
 * and force an immediate reschedule (1-tick LAPIC timer) when the
 * woken VCPU has budget and a smaller period (higher priority) than
 * the currently running VCPU. */
extern void
vcpu_wakeup (task_id task)
{
  DLOG ("vcpu_wakeup (0x%x), cpu=%d", task, get_pcpu_id ());
  quest_tss *tssp = lookup_TSS (task);
  static int next_vcpu_binding = 1;

  /* assign vcpu if not already set */
  if (tssp->cpu == 0xFF) {
    do {
      tssp->cpu = next_vcpu_binding;
      next_vcpu_binding++;
      if (next_vcpu_binding >= NUM_VCPUS)
        next_vcpu_binding = 0;
    } while (vcpu_lookup (tssp->cpu)->type != MAIN_VCPU);
    com1_printf ("vcpu: task 0x%x now bound to vcpu=%d\n", task, tssp->cpu);
  }

  vcpu *v = vcpu_lookup (tssp->cpu);

  /* put task on vcpu runqueue (2nd level) */
  vcpu_runqueue_append (v, task);

  /* put vcpu on pcpu queue (1st level) */
  vcpu_queue_append (percpu_pointer (v->cpu, vcpu_queue), v);

  /* unblock hook runs only on the idle -> runnable transition */
  if (!v->runnable && !v->running && v->hooks->unblock)
    v->hooks->unblock (v);

  v->runnable = TRUE;

  /* check if preemption necessary */
  u64 now;
  vcpu *cur = percpu_read (vcpu_current);

  RDTSC (now);

  if (v->hooks->update_replenishments)
    v->hooks->update_replenishments (v, now);

  if (v->b > 0 && (cur == NULL || cur->T > v->T))
    /* preempt */
    LAPIC_start_timer (1);
}
0807 
0808 /* ************************************************** */
0809 
0810 /* MAIN_VCPU */
0811 
0812 static inline s64
0813 capacity (vcpu *v)
0814 {
0815   u64 now; RDTSC (now);
0816   if (v->main.Q.head == NULL || v->main.Q.head->t > now)
0817     return 0;
0818   return (s64) v->main.Q.head->b - (s64) v->usage;
0819 }
0820 
0821 static void repl_merge (vcpu *);
0822 
0823 static void
0824 main_vcpu_update_replenishments (vcpu *v, u64 tcur)
0825 {
0826   s64 cap = capacity (v);
0827   v->b = (cap > 0 ? cap : 0);
0828 }
0829 
0830 static u64
0831 main_vcpu_next_event (vcpu *v)
0832 {
0833   replenishment *r;
0834   u64 now; RDTSC (now);
0835   for (r = v->main.Q.head; r != NULL; r = r->next) {
0836     if (now < r->t)
0837       return r->t;
0838   }
0839   return 0;
0840 }
0841 
0842 static void
0843 repl_merge (vcpu *v)
0844 {
0845   /* possibly merge */
0846   while (v->main.Q.size > 1) {
0847     u64 t = v->main.Q.head->t;
0848     u64 b = v->main.Q.head->b;
0849     /* observation 3 */
0850     if (t + b >= v->main.Q.head->next->t) {
0851       repl_queue_pop (&v->main.Q);
0852       v->main.Q.head->b += b;
0853       v->main.Q.head->t = t;
0854     } else
0855       break;
0856   }
0857 }
0858 
0859 static void
0860 budget_check (vcpu *v)
0861 {
0862   if (capacity (v) <= 0) {
0863     while (v->main.Q.head->b <= v->usage) {
0864       /* exhaust and reschedule the replenishment */
0865       v->usage -= v->main.Q.head->b;
0866       u64 b = v->main.Q.head->b, t = v->main.Q.head->t;
0867       repl_queue_pop (&v->main.Q);
0868       t += v->T;
0869       repl_queue_add (&v->main.Q, b, t);
0870     }
0871     if (v->usage > 0) {
0872       /* v->usage is the overrun amount */
0873       v->main.Q.head->t += v->usage;
0874       /* possibly merge */
0875       if (v->main.Q.size > 1) {
0876         u64 t = v->main.Q.head->t;
0877         u64 b = v->main.Q.head->b;
0878         if (t + b >= v->main.Q.head->next->t) {
0879           repl_queue_pop (&v->main.Q);
0880           v->main.Q.head->b += b;
0881           v->main.Q.head->t = t;
0882         }
0883       }
0884     }
0885 #if 0
0886     if (capacity (v) == 0) {
0887       /* S.Q.head.time > Now */
0888       /* if not blocked then S.replenishment.enqueue (S.Q.head.time) */
0889     }
0890 #endif
0891   }
0892 }
0893 
/* When a MAIN VCPU blocks with partially-consumed budget: defer the
 * consumed portion (v->usage) by one period T while leaving the
 * remnant available now, so the sum of queued replenishments is
 * preserved.  If the queue is full the remnant merges into the next
 * entry instead.
 * NOTE(review): dereferences main.Q.head unconditionally -- assumes a
 * MAIN VCPU always keeps at least one replenishment queued. */
static void
split_check (vcpu *v)
{
  u64 now; RDTSC (now);
  if (v->usage > 0 && v->main.Q.head->t <= now) {
    u64 remnant = v->main.Q.head->b - v->usage;
    if (v->main.Q.size == MAX_REPL) {
      /* merge with next replenishment */
      repl_queue_pop (&v->main.Q);
      v->main.Q.head->b += remnant;
    } else {
      /* leave remnant as reduced replenishment */
      v->main.Q.head->b = remnant;
    }
    repl_queue_add (&v->main.Q, v->usage, v->main.Q.head->t + v->T);
    /* invariant: sum of replenishments remains the same */
    v->usage = 0;
  }
}
0913 
/* MAIN VCPU hook at the end of a timeslice.  Charges the elapsed time
 * tdelta to usage, runs the sporadic-server budget check (and the
 * split check on blocking), then recomputes the available budget b. */
static void
main_vcpu_end_timeslice (vcpu *cur, u64 tdelta)
{
  /* timeslice ends for one of 3 reasons: budget depletion,
   * preemption, or blocking */

  if (cur->b < tdelta) {
    /* ran longer than the remaining budget: record the overrun */
    cur->sched_overhead = tdelta - cur->b;
    percpu_write64 (pcpu_overhead, cur->sched_overhead);
  }

  cur->usage += tdelta;
  budget_check (cur);
  if (!cur->runnable)
    /* blocked */
    split_check (cur);

  s64 cap = capacity (cur);
  if (cap > 0) {
    /* was preempted or blocked */
    cur->b = cap;
  } else {
    /* budget was depleted */
    cur->b = 0;
    cur->prev_usage = cur->usage;
  }
}
0941 
/* On unblocking with remaining capacity: pull the head replenishment
 * up to the present and merge any successors that have become
 * contiguous (observation 3).  With no capacity, the pending
 * replenishment time governs eligibility instead. */
static void
unblock_check (vcpu *v)
{
  if (capacity (v) > 0) {
    u64 now;
    RDTSC (now);
    v->main.Q.head->t = now;
    /* merge replenishments using observation 3 */
    while (v->main.Q.size > 1) {
      u64 b = v->main.Q.head->b;
      if (v->main.Q.head->next->t <= now + b - v->usage) {
        repl_queue_pop (&v->main.Q);
        v->main.Q.head->b += b;
        v->main.Q.head->t = now;
      } else
        break;
    }
  } else {
    /* S.replenishment.enqueue (S.Q.head.time) */
  }
}
0963 
/* MAIN VCPU hook: run the sporadic-server unblock check on wakeup. */
static void
main_vcpu_unblock (vcpu *v)
{
  unblock_check (v);
}
0969 
/* Hook table wiring the sporadic-server MAIN VCPU policy into the
 * 1st-level scheduler. */
static vcpu_hooks main_vcpu_hooks = {
  .update_replenishments = main_vcpu_update_replenishments,
  .next_event = main_vcpu_next_event,
  .end_timeslice = main_vcpu_end_timeslice,
  .unblock = main_vcpu_unblock
};
0976 
0977 /* IO_VCPU */
0978 
0979 static void
0980 io_vcpu_update_replenishments (vcpu *v, u64 tcur)
0981 {
0982   if (v->io.r.t != 0 && v->io.r.t <= tcur) {
0983     v->b = v->io.r.b;
0984     v->io.r.t = 0;
0985   }
0986 }
0987 
/* IO VCPU hook: the next event is the pending replenishment time
 * (0 when none is scheduled). */
static u64
io_vcpu_next_event (vcpu *v)
{
  return v->io.r.t;
}
0993 
/* IO VCPU hook at the end of a timeslice: charge the elapsed time
 * against the budget; on blocking or exhaustion, advance the
 * eligibility time in proportion to usage (scaled by Uden/Unum) and
 * schedule (or retime) the single replenishment.
 * NOTE(review): 'cur->b <= 0' -- if b is an unsigned type this is
 * equivalent to b == 0; confirm against the vcpu definition. */
static void
io_vcpu_end_timeslice (vcpu *cur, u64 tdelta)
{
  u64 u = tdelta;
  s64 overhead = percpu_read64 (pcpu_overhead);

  /* subtract from budget of current */
  if (cur->b < tdelta) {
    u = cur->b;
    cur->sched_overhead = tdelta - cur->b;
    overhead = cur->sched_overhead;
    percpu_write64 (pcpu_overhead, overhead);
  }

  cur->b -= u;

  cur->usage += tdelta;
  if (!cur->runnable || cur->b <= 0) {
    /* if blocked or exhausted, advance eligibility time */
    cur->io.e += div64_64 (cur->usage * cur->io.Uden, cur->io.Unum);
    /* schedule next replenishment */
    if (cur->io.r.t == 0) {
      cur->io.r.t = cur->io.e;
      cur->io.r.b = div64_64 (cur->T * cur->io.Unum, cur->io.Uden);
    } else {
      /* or update existing replenishment with new eligibility time */
      cur->io.r.t = cur->io.e;
    }
    cur->prev_usage = cur->usage;
    cur->usage = 0;

    cur->b = 0;
    if (!cur->runnable)
      /* we were blocked, reset budgeted state */
      cur->io.budgeted = FALSE;
  }
}
1031 
/* IO VCPU unblock: ensure the eligibility time is not in the past,
 * then (re)arm the single replenishment with the maximum budget
 * Cmax = T * Unum / Uden and mark the VCPU budgeted. */
extern void
io_vcpu_unblock (vcpu *v)
{
  u64 now;
  RDTSC (now);

  if (!v->io.budgeted && v->io.e < now)
    /* eligibility time is in the past, push it up to now */
    v->io.e = now;

  u64 Cmax = div64_64 (v->T * v->io.Unum, v->io.Uden);

  if (v->io.r.t == 0) {
    /* no replenishment pending: create one unless already budgeted */
    if (!v->io.budgeted) {
      v->io.r.t = v->io.e;
      v->io.r.b = Cmax;
    }
  } else {
    v->io.r.b = Cmax;
  }
  v->io.budgeted = TRUE;
}
1054 
1055 extern void
1056 iovcpu_job_wakeup (task_id job, u64 T)
1057 {
1058   quest_tss *tssp = lookup_TSS (job);
1059   vcpu *v = vcpu_lookup (tssp->cpu);
1060   if (v->type == IO_VCPU) {
1061     if (T < v->T || !(v->running || v->runnable))
1062       v->T = T;
1063   }
1064   wakeup (job);
1065 }
1066 
1067 extern void
1068 iovcpu_job_wakeup_for_me (task_id job)
1069 {
1070   vcpu *cur = percpu_read (vcpu_current);
1071   if (cur)
1072     iovcpu_job_wakeup (job, cur->T);
1073   else
1074     wakeup (job);
1075 }
1076 
/* Called by an IO job when its work item completes: yield to the
 * scheduler. */
extern void
iovcpu_job_completion (void)
{
  schedule ();
}
1082 
1083 extern uint
1084 count_set_bits (u32 v)
1085 {
1086   /* Wegner's method */
1087   u32 c;
1088   for (c = 0; v; c++) {
1089     v &= v - 1;              /* clear the least significant bit set */
1090   }
1091   return c;
1092 }
1093 
1094 extern uint
1095 select_iovcpu (iovcpu_class class)
1096 {
1097   uint i, idx = lowest_priority_vcpu (), matches = 0;
1098   for (i=0; i<NUM_VCPUS; i++) {
1099     struct vcpu_params *p = &init_params[i];
1100     if (p->type == IO_VCPU) {
1101       u32 m = count_set_bits (p->class & class);
1102       if (m >= matches) {
1103         idx = i;
1104         matches = m;
1105       }
1106     }
1107   }
1108   return idx;
1109 }
1110 
/* Bind task to the best-matching IO VCPU for the requested class. */
extern void
set_iovcpu (task_id task, iovcpu_class class)
{
  uint i = select_iovcpu (class);
#if 0
  logger_printf ("iovcpu: task 0x%x requested class 0x%x and got IO-VCPU %d\n",
                 task, class, i);
#endif
  lookup_TSS (task)->cpu = i;
}
1121 
/* Hook table wiring the IO VCPU policy into the 1st-level scheduler. */
static vcpu_hooks io_vcpu_hooks = {
  .update_replenishments = io_vcpu_update_replenishments,
  .next_event = io_vcpu_next_event,
  .end_timeslice = io_vcpu_end_timeslice,
  .unblock = io_vcpu_unblock
};
1128 
1129 /* ************************************************** */
1130 
/* Policy dispatch table, indexed by vcpu_type. */
static vcpu_hooks *vcpu_hooks_table[] = {
  [MAIN_VCPU] = &main_vcpu_hooks,
  [IO_VCPU] = &io_vcpu_hooks
};
1135 
/* Module init: derive the TSC scaling factors, then configure each
 * VCPU from init_params -- distributing them round-robin across the
 * available PCPUs, scaling C and T into TSC ticks, seeding the
 * initial replenishment or IO utilization parameters, and installing
 * the per-type policy hooks.  Always returns TRUE. */
extern bool
vcpu_init (void)
{
  uint eax, ecx;

  /* CPUID leaves 1 and 6 report TSC-deadline and ARAT support */
  cpuid (1, 0, NULL, NULL, &ecx, NULL);
  cpuid (6, 0, &eax, NULL, NULL, NULL);

  logger_printf ("vcpu: init num_vcpus=%d num_cpus=%d TSC_deadline=%s ARAT=%s\n",
                 NUM_VCPUS, mp_num_cpus,
                 (ecx & (1 << 24)) ? "yes" : "no",
                 (eax & (1 << 2))  ? "yes" : "no");

  memset (vcpus, 0, sizeof(vcpus));

  int cpu_i=0, vcpu_i;
  vcpu *vcpu;

  RDTSC (vcpu_init_time);
  tsc_freq_msec = div_u64_u32_u32 (tsc_freq, 1000);
  tsc_unit_freq = div_u64_u32_u32 (tsc_freq, UNITS_PER_SEC);
  logger_printf ("vcpu: tsc_freq_msec=0x%X unit_freq=0x%X\n",
                 tsc_freq_msec, tsc_unit_freq);

  /* distribute VCPUs across PCPUs */
  for (vcpu_i=0; vcpu_i<NUM_VCPUS; vcpu_i++) {
    u32 C = init_params[vcpu_i].C;
    u32 T = init_params[vcpu_i].T;
    vcpu_type type = init_params[vcpu_i].type;
    vcpu = vcpu_lookup (vcpu_i);
    vcpu->cpu = cpu_i++;
    if (cpu_i >= mp_num_cpus)
      cpu_i = 0;
    vcpu->quantum = div_u64_u32_u32 (tsc_freq, QUANTUM_HZ);
    /* scale budget and period from scheduling units into TSC ticks */
    vcpu->C = C * tsc_unit_freq;
    vcpu->T = T * tsc_unit_freq;
    vcpu->type = type;
#ifndef SPORADIC_IO
    if (vcpu->type == MAIN_VCPU) {
      repl_queue_add (&vcpu->main.Q, vcpu->C, vcpu_init_time);
    } else if (vcpu->type == IO_VCPU) {
      vcpu->io.Unum = C;
      vcpu->io.Uden = T;
      vcpu->b = vcpu->C;
    }
    vcpu->hooks = vcpu_hooks_table[type];
#else
    /* SPORADIC_IO: every VCPU runs the sporadic-server MAIN policy */
    repl_queue_add (&vcpu->main.Q, vcpu->C, vcpu_init_time);
    vcpu->hooks = vcpu_hooks_table[MAIN_VCPU];
#endif

    logger_printf ("vcpu: %svcpu=%d pcpu=%d C=0x%llX T=0x%llX U=%d%%\n",
                   type == IO_VCPU ? "IO " : "",
                   vcpu_i, vcpu->cpu, vcpu->C, vcpu->T, (C * 100) / T);
  }
  return TRUE;
}
1193 
1194 /* ************************************************** */
1195 
1196 #include "module/header.h"
1197 
/* Boot-module registration: vcpu_init runs during module init. */
static const struct module_ops mod_ops = {
  .init = vcpu_init
};

DEF_MODULE (sched___vcpu, "VCPU scheduler", &mod_ops, {});
1203 
1204 /*
1205  * Local Variables:
1206  * indent-tabs-mode: nil
1207  * mode: C
1208  * c-file-style: "gnu"
1209  * c-basic-offset: 2
1210  * End:
1211  */
1212 
1213 /* vi: set et sw=2 sts=2: */