/* kvmclock.c */
  1. #include "libcflat.h"
  2. #include "smp.h"
  3. #include "atomic.h"
  4. #include "processor.h"
  5. #include "kvmclock.h"
  6. #include "asm/barrier.h"
  7. #define unlikely(x) __builtin_expect(!!(x), 0)
  8. #define likely(x) __builtin_expect(!!(x), 1)
  9. struct pvclock_vcpu_time_info __attribute__((aligned(4))) hv_clock[MAX_CPU];
  10. struct pvclock_wall_clock wall_clock;
  11. static unsigned char valid_flags = 0;
  12. static atomic64_t last_value = ATOMIC64_INIT(0);
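
/*
 * Note on the globals above: each vCPU's hv_clock[] entry is shared with
 * the host once it is registered via MSR_KVM_SYSTEM_TIME_NEW in
 * kvm_clock_init() below, and last_value backs the monotonicity fixup
 * in pvclock_clocksource_read().
 */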

/*
 * Scale a 64-bit delta by shifting it and then multiplying by a 32-bit
 * fraction, yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
        u64 product;
#ifdef __i386__
        u32 tmp1, tmp2;
#endif

        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

#ifdef __i386__
        __asm__ (
                "mul %5 ; "
                "mov %4,%%eax ; "
                "mov %%edx,%4 ; "
                "mul %5 ; "
                "xor %5,%5 ; "
                "add %4,%%eax ; "
                "adc %5,%%edx ; "
                : "=A" (product), "=r" (tmp1), "=r" (tmp2)
                : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
        __asm__ (
                "mul %%rdx ; shrd $32,%%rdx,%%rax"
                : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

        return product;
}
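
/*
 * Worked example (illustrative numbers, not from any real host):
 * mul_frac is a binary fraction scaled by 2^32, so mul_frac = 0x80000000
 * means 0.5. With shift = 1 and delta = 1000 TSC ticks, scale_delta()
 * computes (1000 << 1) * 0.5 = 1000, i.e. a 1 GHz TSC scaled to
 * nanoseconds (effective factor 2 * 0.5 = 1 ns per tick).
 */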

#ifdef __i386__
# define do_div(n,base) ({                              \
        u32 __base = (base);                            \
        u32 __rem;                                      \
        __rem = ((u64)(n)) % __base;                    \
        (n) = ((u64)(n)) / __base;                      \
        __rem;                                          \
})
#else
u32 __attribute__((weak)) __div64_32(u64 *n, u32 base)
{
        u64 rem = *n;
        u64 b = base;
        u64 res, d = 1;
        u32 high = rem >> 32;

        /* Reduce the thing a bit first */
        res = 0;
        if (high >= base) {
                high /= base;
                res = (u64) high << 32;
                rem -= (u64) (high*base) << 32;
        }

        while ((s64)b > 0 && b < rem) {
                b = b+b;
                d = d+d;
        }

        do {
                if (rem >= b) {
                        rem -= b;
                        res += d;
                }
                b >>= 1;
                d >>= 1;
        } while (d);

        *n = res;
        return rem;
}

# define do_div(n,base) ({                              \
        u32 __base = (base);                            \
        u32 __rem;                                      \
        (void)(((typeof((n)) *)0) == ((u64 *)0));       \
        if (likely(((n) >> 32) == 0)) {                 \
                __rem = (u32)(n) % __base;              \
                (n) = (u32)(n) / __base;                \
        } else                                          \
                __rem = __div64_32(&(n), __base);       \
        __rem;                                          \
})
#endif
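
/*
 * do_div() follows the kernel convention: it divides the 64-bit dividend
 * in place and evaluates to the 32-bit remainder. A minimal usage sketch:
 *
 *      u64 ns = 2500000123ULL;
 *      u32 rem = do_div(ns, NSEC_PER_SEC);     // ns = 2, rem = 500000123
 */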

/**
 * set_normalized_timespec - set timespec sec and nsec parts and normalize
 *
 * @ts:         pointer to timespec variable to be set
 * @sec:        seconds to set
 * @nsec:       nanoseconds to set
 *
 * Set the seconds and nanoseconds fields of a timespec variable and
 * normalize to the timespec storage format.
 *
 * Note: The tv_nsec part is always in the range of
 *      0 <= tv_nsec < NSEC_PER_SEC.
 * For negative values only the tv_sec field is negative!
 */
void set_normalized_timespec(struct timespec *ts, long sec, s64 nsec)
{
        while (nsec >= NSEC_PER_SEC) {
                /*
                 * The following asm() prevents the compiler from
                 * optimising this loop into a modulo operation. See
                 * also __iter_div_u64_rem() in include/linux/time.h
                 */
                asm("" : "+rm"(nsec));
                nsec -= NSEC_PER_SEC;
                ++sec;
        }
        while (nsec < 0) {
                asm("" : "+rm"(nsec));
                nsec += NSEC_PER_SEC;
                --sec;
        }
        ts->tv_sec = sec;
        ts->tv_nsec = nsec;
}
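
/*
 * The pvclock time info is protected by a seqcount-style version field:
 * the host makes the version odd before updating the data and even again
 * afterwards, so a reader retries whenever it observes an odd version or
 * a version change across its reads.
 */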

static inline
unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
{
        unsigned version = src->version & ~1;

        /* Make sure that the version is read before the data. */
        smp_rmb();
        return version;
}

static inline
bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
                        unsigned version)
{
        /* Make sure that the version is re-read after the data. */
        smp_rmb();
        return version != src->version;
}

static inline u64 rdtsc_ordered(void)
{
        /*
         * FIXME: on Intel CPUs rmb() aka lfence is sufficient, which would
         * bring up to a 2x speedup.
         */
        mb();
        return rdtsc();
}

static inline
cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
{
        u64 delta = rdtsc_ordered() - src->tsc_timestamp;
        cycle_t offset = scale_delta(delta, src->tsc_to_system_mul,
                                     src->tsc_shift);
        return src->system_time + offset;
}
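
/*
 * Putting the pieces together, a raw pvclock reading is
 *
 *      time = system_time
 *           + scale_delta(tsc - tsc_timestamp, tsc_to_system_mul, tsc_shift)
 *
 * where all fields come from one consistent snapshot of the time info,
 * guaranteed by the version loop in pvclock_clocksource_read() below.
 */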

cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
        unsigned version;
        cycle_t ret;
        u64 last;
        u8 flags;

        do {
                version = pvclock_read_begin(src);
                ret = __pvclock_read_cycles(src);
                flags = src->flags;
        } while (pvclock_read_retry(src, version));

        if ((valid_flags & PVCLOCK_RAW_CYCLE_BIT) ||
            ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
             (flags & PVCLOCK_TSC_STABLE_BIT)))
                return ret;

        /*
         * Assumption here is that last_value, a global accumulator, always
         * goes forward. If we are less than that, we should not be much
         * smaller. We assume there is an error margin we're inside, and
         * then the correction does not sacrifice accuracy.
         *
         * For reads: global may have changed between test and return,
         * but this means someone else poked the clock at a later time.
         * We just need to make sure we are not seeing a backwards event.
         *
         * For updates: last_value = ret is not enough, since two vcpus could
         * be updating at the same time, and one of them could be slightly
         * behind, making the assumption that last_value always goes forward
         * fail to hold.
         */
        last = atomic64_read(&last_value);
        do {
                if (ret < last)
                        return last;
                last = atomic64_cmpxchg(&last_value, last, ret);
        } while (unlikely(last != ret));

        return ret;
}

cycle_t kvm_clock_read(void)
{
        struct pvclock_vcpu_time_info *src;
        cycle_t ret;
        int index = smp_id();

        src = &hv_clock[index];
        ret = pvclock_clocksource_read(src);
        return ret;
}
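
/*
 * Writing the address of the per-cpu time info to MSR_KVM_SYSTEM_TIME_NEW
 * registers it with the host. Bit 0 of the written value is the enable
 * bit, which is why kvm_clock_init() ORs in 1 and kvm_clock_clear()
 * writes 0 to disable the clock again.
 */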

void kvm_clock_init(void *data)
{
        int index = smp_id();
        struct pvclock_vcpu_time_info *hvc = &hv_clock[index];

        printf("kvm-clock: cpu %d, msr %p\n", index, hvc);
        wrmsr(MSR_KVM_SYSTEM_TIME_NEW, (unsigned long)hvc | 1);
}

void kvm_clock_clear(void *data)
{
        wrmsr(MSR_KVM_SYSTEM_TIME_NEW, 0LL);
}

void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
                            struct pvclock_vcpu_time_info *vcpu_time,
                            struct timespec *ts)
{
        u32 version;
        u64 delta;
        struct timespec now;

        /* get wallclock at system boot */
        do {
                version = wall_clock->version;
                rmb();          /* fetch version before time */
                now.tv_sec = wall_clock->sec;
                now.tv_nsec = wall_clock->nsec;
                rmb();          /* fetch time before checking version */
        } while ((wall_clock->version & 1) || (version != wall_clock->version));

        delta = pvclock_clocksource_read(vcpu_time);    /* time since system boot */
        delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

        now.tv_nsec = do_div(delta, NSEC_PER_SEC);
        now.tv_sec = delta;

        set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

void kvm_get_wallclock(struct timespec *ts)
{
        struct pvclock_vcpu_time_info *vcpu_time;
        int index = smp_id();

        wrmsr(MSR_KVM_WALL_CLOCK_NEW, (unsigned long)&wall_clock);
        vcpu_time = &hv_clock[index];
        pvclock_read_wallclock(&wall_clock, vcpu_time, ts);
}

void pvclock_set_flags(unsigned char flags)
{
        valid_flags = flags;
}
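
/*
 * A caller typically sets the flags it trusts before reading the clock,
 * e.g. (hypothetical caller):
 *
 *      pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 *      cycle_t t = kvm_clock_read();
 *
 * With PVCLOCK_TSC_STABLE_BIT accepted, pvclock_clocksource_read() can
 * skip the global last_value fixup whenever the host also reports a
 * stable TSC.
 */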