11import { browser } from "$app/environment" ;
22import posthog from "posthog-js" ;
3- import type { LanguageRegistration , ShikiTransformer } from "shiki" ;
3+ import { type LanguageRegistration , type ShikiTransformer , isPlainLang } from "shiki" ;
4+ import { dlog } from "$lib/debug" ;
45
56/**
67 * Pre-load the languages by returning regular expressions from language
78 * registrations.
89 *
910 * @param languages a set of languages and their associated registrations.
11+ * Languages must be sorted from the least to the most weighted.
1012 * @returns a set of languages and their associated regular expressions to test code against.
1113 */
1214export function loadLanguages (
@@ -59,27 +61,61 @@ export function detectLanguage(
5961 let highestRate = 0 ;
6062 let highestTotal = 0 ;
6163
64+ const trimmed = code . trim ( ) ;
65+ if ( ! trimmed ) return languageCandidate ;
66+
67+ dlog ( "===== Starting determining language for:" ) ;
68+ dlog ( code ) ;
69+ dlog ( "=====" ) ;
70+
71+ // first, try to detect the language from a space-free first-line comment (e.g. "//file.ts" or "#file.py"), using its extension when it is a known language
72+ const firstLine = trimmed . split ( "\n" ) . shift ( ) ?. trim ( ) ;
73+ if ( firstLine ) {
74+ const isComment =
75+ ( firstLine . startsWith ( "//" ) || firstLine . startsWith ( "#" ) ) && ! firstLine . includes ( " " ) ;
76+ if ( isComment ) {
77+ dlog ( `First line comment: ${ firstLine } ` ) ;
78+ const firstSplit = firstLine . split ( "." ) ;
79+ if ( firstSplit . length ) {
80+ const extension = firstSplit . pop ( ) ;
81+ if ( extension && Object . keys ( languages ) . includes ( extension ) ) {
82+ dlog ( `Found valid language from first comment: ${ extension } ` ) ;
83+ return extension ;
84+ }
85+ }
86+ }
87+ }
88+
89+ // otherwise, loop over regexes
6290 for ( const [ language , regexps ] of Object . entries ( languages ) ) {
6391 if ( ! regexps . length ) continue ;
64- const matchesCount = regexps
65- . map ( regexp => {
66- try {
67- return code . match ( regexp ) ?. length ?? 0 ;
68- } catch {
69- return 0 ;
70- }
71- } )
72- . reduce ( ( acc , b ) => acc + b , 0 ) ;
73- const successRate = matchesCount / regexps . length ;
92+ const compute = regexps . map < { matches : boolean ; count : number } > ( regexp => {
93+ try {
94+ const match = code . match ( regexp ) ;
95+ return { matches : ! ! match , count : match ?. length ?? 0 } ;
96+ } catch {
97+ return { matches : false , count : 0 } ;
98+ }
99+ } ) ;
100+ const matchesLength = compute . reduce ( ( acc , item ) => acc + item . count , 0 ) ;
101+ const matchesCount = compute . filter ( item => item . matches ) . length ;
102+ const successRate = matchesLength / matchesCount ;
103+ dlog (
104+ `[${ language } ]\t${ matchesLength } on ${ matchesCount } regexes matches over ${ regexps . length } regexes - success rate: ${ Math . round ( ( successRate * 100 + Number . EPSILON ) * 100 ) / 100 } %`
105+ ) ;
74106 if (
75107 successRate > highestRate ||
76108 ( successRate === highestRate && regexps . length > highestTotal )
77109 ) {
110+ dlog (
111+ `New candidate found! Previous values: ${ languageCandidate } - highest rate ${ highestRate } , highest total regexes: ${ highestTotal } `
112+ ) ;
78113 languageCandidate = language ;
79114 highestRate = successRate ;
80115 highestTotal = regexps . length ;
81116 }
82117 }
118+ dlog ( `Done: result is ${ languageCandidate } ` ) ;
83119 return languageCandidate ;
84120}
85121
@@ -103,6 +139,8 @@ export function transformerLanguageDetection(
103139 return {
104140 preprocess ( code , options ) {
105141 if ( options . lang === "diff" ) {
142+ // tests:
143+ // - /issues/sveltejs/svelte/14280
106144 const cleanedCode = code
107145 . split ( "\n" )
108146 . map ( line => line . replace ( / ^ [ + - ] / , "" ) )
@@ -117,8 +155,16 @@ export function transformerLanguageDetection(
117155 }
118156 options . lang = detectedLanguage ;
119157 if ( options . meta ) options . meta [ "data-detected" ] = true ;
120- return code ;
158+ } else if ( isPlainLang ( options . lang ) ) {
159+ // tests:
160+ // - /issues/sveltejs/svelte/16072
161+ const detectedLanguage = detectLanguage ( code , languages ) ;
162+ if ( detectedLanguage ) {
163+ options . lang = detectedLanguage ;
164+ if ( options . meta ) options . meta [ "data-detected" ] = true ;
165+ }
121166 }
167+ return code ;
122168 } ,
123169 pre ( node ) {
124170 node . properties [ "data-language" ] = this . options . lang
0 commit comments